-rw-r--r--  GPL/tools/trex/trex_astf_assert.py | 2
-rw-r--r--  GPL/tools/trex/trex_astf_profile.py | 2
-rw-r--r--  GPL/tools/trex/trex_astf_stop.py | 2
-rw-r--r--  GPL/tools/trex/trex_stl_assert.py | 2
-rw-r--r--  GPL/tools/trex/trex_stl_profile.py | 2
-rw-r--r--  GPL/tools/trex/trex_stl_stop.py | 2
-rw-r--r--  GPL/traffic_profiles/trex/profile_trex_astf_base_class.py | 2
-rw-r--r--  VPP_DEVICE_IMAGE_CENTOS | 1
-rw-r--r--  VPP_DEVICE_IMAGE_UBUNTU | 2
-rw-r--r--  VPP_DEVICE_IMAGE_UBUNTU_ARM | 2
-rw-r--r--  VPP_STABLE_VER_CENTOS | 2
-rw-r--r--  VPP_STABLE_VER_UBUNTU_BIONIC | 2
-rw-r--r--  requirements.txt | 6
-rw-r--r--  resources/libraries/bash/entry/per_patch_device.sh | 3
-rw-r--r--  resources/libraries/bash/function/ansible.sh | 16
-rw-r--r--  resources/libraries/python/Constants.py | 8
-rw-r--r--  resources/libraries/python/ContainerUtils.py | 18
-rw-r--r--  resources/libraries/python/PapiExecutor.py | 12
-rw-r--r--  resources/libraries/python/QemuUtils.py | 1
-rw-r--r--  resources/tools/presentation/conf.py | 2
-rw-r--r--  resources/tools/presentation/conf_cpta/conf.py | 2
-rw-r--r--  resources/tools/presentation/input_data_files.py | 150
-rw-r--r--  resources/tools/presentation/requirements.txt | 6
-rwxr-xr-x  resources/tools/presentation/run_cpta.sh | 2
-rwxr-xr-x  resources/tools/presentation/run_report.sh | 1
-rw-r--r--  resources/tools/presentation/specification.yaml | 10
-rw-r--r--  resources/tools/presentation/specification_CPTA.yaml | 7
-rw-r--r--  resources/tools/presentation/specification_local.yaml | 4
-rw-r--r--  resources/tools/testbed-setup/ansible/dev.yaml | 6
-rw-r--r--  resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.13.yaml | 32
-rw-r--r--  resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.14.yaml | 32
-rw-r--r--  resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.28.yaml | 10
-rw-r--r--  resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.29.yaml | 10
-rw-r--r--  resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.30.yaml | 10
-rw-r--r--  resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.32.yaml | 10
-rw-r--r--  resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.33.yaml | 10
-rw-r--r--  resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.34.yaml | 10
-rw-r--r--  resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.35.yaml | 10
-rw-r--r--  resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.39.yaml | 12
-rw-r--r--  resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.40.yaml | 12
-rw-r--r--  resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.49.yaml | 5
-rw-r--r--  resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.50.yaml | 13
-rw-r--r--  resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.51.yaml | 13
-rw-r--r--  resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.65.yaml | 12
-rw-r--r--  resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.66.yaml | 12
-rw-r--r--  resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.67.yaml | 12
-rw-r--r--  resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.68.yaml | 10
-rw-r--r--  resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.69.yaml | 2
-rw-r--r--  resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.70.yaml | 10
-rw-r--r--  resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.71.yaml | 10
-rw-r--r--  resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.32.8.14.yaml | 15
-rw-r--r--  resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.32.8.15.yaml | 13
-rw-r--r--  resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.32.8.16.yaml | 13
-rw-r--r--  resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.32.8.17.yaml | 13
-rw-r--r--  resources/tools/testbed-setup/ansible/inventories/lf_inventory/hosts | 1
-rw-r--r--  resources/tools/testbed-setup/ansible/nomad.yaml | 16
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/ab/defaults/main.yaml | 7
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/ab/tasks/main.yaml | 15
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/aws/tasks/main.yaml | 57
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/aws/tasks/ubuntu_bionic.yaml | 14
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/cadvisor/defaults/main.yaml | 24
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/cadvisor/tasks/main.yaml | 39
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/calibration/defaults/main.yaml | 32
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/calibration/tasks/main.yaml | 14
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/cleanup/tasks/clean_images.yaml | 36
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/cleanup/tasks/kill_containers.yaml | 20
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/cleanup/tasks/kill_process.yaml | 26
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/cleanup/tasks/main.yaml | 18
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/cleanup/tasks/nomad.yaml | 22
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/cleanup/tasks/remove_package.yaml | 24
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/cleanup/tasks/sut.yaml | 57
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/cleanup/tasks/tg.yaml | 16
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/cleanup/tasks/vpp_device.yaml | 39
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/common/defaults/main.yaml | 53
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/common/handlers/main.yaml | 2
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/common/tasks/main.yaml | 42
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/consul/tasks/main.yaml | 24
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/consul/templates/base.hcl.j2 | 6
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/consul/templates/telemetry.hcl.j2 | 3
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/csit_shim_image/files/Dockerfile | 61
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/csit_shim_image/files/files/badkeypub | 1
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/csit_shim_image/files/files/sshconfig | 3
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/csit_shim_image/files/files/wrapdocker | 113
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/csit_shim_image/tasks/main.yaml | 32
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/csit_sut_image/files/Dockerfile | 93
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/csit_sut_image/tasks/main.yaml | 11
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/docker/defaults/main.yaml | 4
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/docker/handlers/main.yaml | 2
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/docker/tasks/main.yaml | 12
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/docker/tasks/ubuntu_bionic.yaml | 11
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/docker/tasks/ubuntu_focal.yaml | 30
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/docker/templates/daemon.json.j2 | 1
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/dpdk/defaults/main.yaml | 14
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/dpdk/tasks/main.yaml | 34
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/iperf/defaults/main.yaml | 10
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/iperf/tasks/main.yaml | 34
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/defaults/main.yaml | 35
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/handlers/main.yaml | 9
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/tasks/main.yaml | 38
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/templates/jenkins-job-health-exporter.j2 | 16
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/templates/jenkins-job-health-exporter.service.j2 | 13
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/kernel/defaults/main.yaml | 39
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/kernel/handlers/main.yaml | 2
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/kernel/tasks/main.yaml | 7
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/kernel/tasks/ubuntu_bionic.yaml | 31
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/kernel/tasks/ubuntu_focal.yaml | 51
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/kernel_vm/tasks/main.yaml | 44
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/kubernetes/tasks/ubuntu_bionic.yaml | 2
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/mellanox/defaults/main.yaml | 2
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/mellanox/tasks/main.yaml | 39
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/nomad/tasks/main.yaml | 10
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/performance_tuning/defaults/main.yaml | 7
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/performance_tuning/tasks/main.yaml | 103
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/performance_tuning/tasks/turbo_boost.yaml | 28
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/performance_tuning/tasks/ubuntu_bionic.yaml | 18
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/prometheus_exporter/defaults/main.yaml | 17
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/prometheus_exporter/files/blackbox.yml | 25
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/prometheus_exporter/handlers/main.yaml | 16
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/prometheus_exporter/tasks/main.yaml | 15
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/prometheus_exporter/tasks/ubuntu_bionic.yaml | 33
-rwxr-xr-x  resources/tools/testbed-setup/ansible/roles/tg/files/csit-initialize-docker-tg.sh | 4
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/tg/tasks/main.yaml | 18
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/topology/tasks/main.yaml | 4
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/trex/defaults/main.yaml | 45
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/trex/tasks/deploy_block.yaml | 14
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/trex/tasks/main.yaml | 18
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/vpp/defaults/main.yaml | 25
-rw-r--r--  resources/tools/testbed-setup/ansible/roles/vpp/tasks/main.yaml | 28
-rw-r--r--  resources/tools/testbed-setup/ansible/site.yaml | 19
-rw-r--r--  resources/tools/testbed-setup/ansible/site_aws.yaml | 8
-rw-r--r--  resources/tools/testbed-setup/ansible/site_azure.yaml | 8
-rw-r--r--  resources/tools/testbed-setup/ansible/sut.yaml | 75
-rw-r--r--  resources/tools/testbed-setup/ansible/sut_aws.yaml | 25
-rw-r--r--  resources/tools/testbed-setup/ansible/sut_azure.yaml | 25
-rw-r--r--  resources/tools/testbed-setup/ansible/tg.yaml | 85
-rw-r--r--  resources/tools/testbed-setup/ansible/tg_aws.yaml | 29
-rw-r--r--  resources/tools/testbed-setup/ansible/tg_azure.yaml | 27
-rw-r--r--  resources/tools/testbed-setup/ansible/vpp_device.yaml | 14
-rw-r--r--  terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl | 333
-rw-r--r--  terraform-ci-infra/1n_nmd/alertmanager/main.tf | 37
-rw-r--r--  terraform-ci-infra/1n_nmd/alertmanager/variables.tf | 84
-rw-r--r--  terraform-ci-infra/1n_nmd/exporter/conf/nomad/exporter.hcl | 587
-rw-r--r--  terraform-ci-infra/1n_nmd/exporter/main.tf | 64
-rw-r--r--  terraform-ci-infra/1n_nmd/exporter/variables.tf | 76
-rw-r--r--  terraform-ci-infra/1n_nmd/grafana/conf/blackbox_exporter_http.json | 1030
-rw-r--r--  terraform-ci-infra/1n_nmd/grafana/conf/blackbox_exporter_icmp.json | 368
-rw-r--r--  terraform-ci-infra/1n_nmd/grafana/conf/consul.json | 1438
-rw-r--r--  terraform-ci-infra/1n_nmd/grafana/conf/docker_cadvisor.json | 2040
-rw-r--r--  terraform-ci-infra/1n_nmd/grafana/conf/node_exporter.json | 13696
-rw-r--r--  terraform-ci-infra/1n_nmd/grafana/conf/nomad.json | 869
-rw-r--r--  terraform-ci-infra/1n_nmd/grafana/conf/nomad/grafana.hcl | 331
-rw-r--r--  terraform-ci-infra/1n_nmd/grafana/conf/prometheus.json | 3055
-rw-r--r--  terraform-ci-infra/1n_nmd/grafana/main.tf | 24
-rw-r--r--  terraform-ci-infra/1n_nmd/grafana/variables.tf | 66
-rw-r--r--  terraform-ci-infra/1n_nmd/main.tf | 188
-rw-r--r--  terraform-ci-infra/1n_nmd/minio/outputs.tf | 4
-rw-r--r--  terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl | 569
-rw-r--r--  terraform-ci-infra/1n_nmd/prometheus/main.tf | 37
-rw-r--r--  terraform-ci-infra/1n_nmd/prometheus/variables.tf | 84
-rw-r--r--  terraform-ci-infra/1n_nmd/terraform.tfstate | 829
-rw-r--r--  terraform-ci-infra/1n_nmd/terraform.tfstate.backup | 831
-rw-r--r--  terraform-ci-infra/1n_nmd/vpp_device/conf/nomad/csit_shim.hcl | 169
-rw-r--r--  terraform-ci-infra/1n_nmd/vpp_device/main.tf | 19
-rw-r--r--  terraform-ci-infra/1n_nmd/vpp_device/variables.tf | 31
164 files changed, 28553 insertions(+), 1038 deletions(-)
diff --git a/GPL/tools/trex/trex_astf_assert.py b/GPL/tools/trex/trex_astf_assert.py
index d6d74bcff8..1a2e38810c 100644
--- a/GPL/tools/trex/trex_astf_assert.py
+++ b/GPL/tools/trex/trex_astf_assert.py
@@ -28,7 +28,7 @@ Functionality:
import sys
sys.path.insert(
- 0, u"/opt/trex-core-2.82/scripts/automation/trex_control_plane/interactive/"
+ 0, u"/opt/trex-core-2.88/scripts/automation/trex_control_plane/interactive/"
)
from trex.astf.api import *
diff --git a/GPL/tools/trex/trex_astf_profile.py b/GPL/tools/trex/trex_astf_profile.py
index 63831e4bf4..df0cbccce2 100644
--- a/GPL/tools/trex/trex_astf_profile.py
+++ b/GPL/tools/trex/trex_astf_profile.py
@@ -24,7 +24,7 @@ import sys
import time
sys.path.insert(
- 0, u"/opt/trex-core-2.82/scripts/automation/trex_control_plane/interactive/"
+ 0, u"/opt/trex-core-2.88/scripts/automation/trex_control_plane/interactive/"
)
from trex.astf.api import *
diff --git a/GPL/tools/trex/trex_astf_stop.py b/GPL/tools/trex/trex_astf_stop.py
index d9784e661a..7e4d5ceeee 100644
--- a/GPL/tools/trex/trex_astf_stop.py
+++ b/GPL/tools/trex/trex_astf_stop.py
@@ -34,7 +34,7 @@ import sys
from collections import OrderedDict # Needed to parse xstats representation.
sys.path.insert(
- 0, u"/opt/trex-core-2.82/scripts/automation/trex_control_plane/interactive/"
+ 0, u"/opt/trex-core-2.88/scripts/automation/trex_control_plane/interactive/"
)
from trex.astf.api import *
diff --git a/GPL/tools/trex/trex_stl_assert.py b/GPL/tools/trex/trex_stl_assert.py
index 7c7e9215b8..ca92ad27e4 100644
--- a/GPL/tools/trex/trex_stl_assert.py
+++ b/GPL/tools/trex/trex_stl_assert.py
@@ -28,7 +28,7 @@ Functionality:
import sys
sys.path.insert(
- 0, u"/opt/trex-core-2.82/scripts/automation/trex_control_plane/interactive/"
+ 0, u"/opt/trex-core-2.88/scripts/automation/trex_control_plane/interactive/"
)
from trex.stl.api import *
diff --git a/GPL/tools/trex/trex_stl_profile.py b/GPL/tools/trex/trex_stl_profile.py
index ea001b58b6..e0a7b6cdbd 100644
--- a/GPL/tools/trex/trex_stl_profile.py
+++ b/GPL/tools/trex/trex_stl_profile.py
@@ -24,7 +24,7 @@ import sys
import time
sys.path.insert(
- 0, u"/opt/trex-core-2.82/scripts/automation/trex_control_plane/interactive/"
+ 0, u"/opt/trex-core-2.88/scripts/automation/trex_control_plane/interactive/"
)
from trex.stl.api import *
diff --git a/GPL/tools/trex/trex_stl_stop.py b/GPL/tools/trex/trex_stl_stop.py
index f07479da8a..2c34618bd1 100644
--- a/GPL/tools/trex/trex_stl_stop.py
+++ b/GPL/tools/trex/trex_stl_stop.py
@@ -34,7 +34,7 @@ import sys
from collections import OrderedDict # Needed to parse xstats representation.
sys.path.insert(
- 0, u"/opt/trex-core-2.82/scripts/automation/trex_control_plane/interactive/"
+ 0, u"/opt/trex-core-2.88/scripts/automation/trex_control_plane/interactive/"
)
from trex.stl.api import *
diff --git a/GPL/traffic_profiles/trex/profile_trex_astf_base_class.py b/GPL/traffic_profiles/trex/profile_trex_astf_base_class.py
index 913a44754c..f1d291105f 100644
--- a/GPL/traffic_profiles/trex/profile_trex_astf_base_class.py
+++ b/GPL/traffic_profiles/trex/profile_trex_astf_base_class.py
@@ -123,7 +123,7 @@ class TrafficProfileBaseClass:
"""
self.framesize = kwargs[u"framesize"]
self._pcap_dir = kwargs.get(
- u"pcap_dir",u"/opt/trex-core-2.82/scripts/avl"
+ u"pcap_dir",u"/opt/trex-core-2.88/scripts/avl"
)
return self.create_profile()
diff --git a/VPP_DEVICE_IMAGE_CENTOS b/VPP_DEVICE_IMAGE_CENTOS
deleted file mode 100644
index 4b3e92939d..0000000000
--- a/VPP_DEVICE_IMAGE_CENTOS
+++ /dev/null
@@ -1 +0,0 @@
-snergster/vpp-centos:latest
diff --git a/VPP_DEVICE_IMAGE_UBUNTU b/VPP_DEVICE_IMAGE_UBUNTU
index 3f47e59a88..e5b17e99b1 100644
--- a/VPP_DEVICE_IMAGE_UBUNTU
+++ b/VPP_DEVICE_IMAGE_UBUNTU
@@ -1 +1 @@
-snergster/csit-sut:latest
+csit_sut-ubuntu2004:local
\ No newline at end of file
diff --git a/VPP_DEVICE_IMAGE_UBUNTU_ARM b/VPP_DEVICE_IMAGE_UBUNTU_ARM
index 46a5db3578..4dabdacb85 100644
--- a/VPP_DEVICE_IMAGE_UBUNTU_ARM
+++ b/VPP_DEVICE_IMAGE_UBUNTU_ARM
@@ -1 +1 @@
-snergster/csit-arm-sut:latest
+csit_sut-ubuntu2004:local
diff --git a/VPP_STABLE_VER_CENTOS b/VPP_STABLE_VER_CENTOS
index 0d7d1da1aa..344d23a945 100644
--- a/VPP_STABLE_VER_CENTOS
+++ b/VPP_STABLE_VER_CENTOS
@@ -1 +1 @@
-20.09.0-60~gd1598d427~b80
+20.09.0-64~g4f7b92f0a~b84
diff --git a/VPP_STABLE_VER_UBUNTU_BIONIC b/VPP_STABLE_VER_UBUNTU_BIONIC
index 01776fa79e..2be5d3e7ea 100644
--- a/VPP_STABLE_VER_UBUNTU_BIONIC
+++ b/VPP_STABLE_VER_UBUNTU_BIONIC
@@ -1 +1 @@
-20.09.0-60~gd1598d427
+20.09.0-64~g4f7b92f0a
diff --git a/requirements.txt b/requirements.txt
index 8c0928206e..b467d6f7ab 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -27,13 +27,13 @@ robotframework==3.1.2
scapy==2.4.3
scp==0.13.2
-# Bootstraping dependencies
-ansible==2.7.8
+# IaaC dependencies
+ansible==2.10.7
# PLRsearch dependencies
dill==0.2.8.2
numpy==1.17.3
-scipy==1.1.0
+scipy==1.5.4
# PAL/DOC dependencies
hdrhistogram==0.6.1
diff --git a/resources/libraries/bash/entry/per_patch_device.sh b/resources/libraries/bash/entry/per_patch_device.sh
index 01e4de0089..efa925eda0 100644
--- a/resources/libraries/bash/entry/per_patch_device.sh
+++ b/resources/libraries/bash/entry/per_patch_device.sh
@@ -1,6 +1,6 @@
#!/usr/bin/env bash
-# Copyright (c) 2020 Cisco and/or its affiliates.
+# Copyright (c) 2021 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -57,5 +57,4 @@ select_vpp_device_tags || die
compose_pybot_arguments || die
run_pybot || die
move_archives || die
-archive_test_results "csit_current" || die
die_on_pybot_error || die
diff --git a/resources/libraries/bash/function/ansible.sh b/resources/libraries/bash/function/ansible.sh
index 1263412dd5..f522faa93b 100644
--- a/resources/libraries/bash/function/ansible.sh
+++ b/resources/libraries/bash/function/ansible.sh
@@ -1,6 +1,6 @@
#!/usr/bin/env bash
-# Copyright (c) 2020 Cisco and/or its affiliates.
+# Copyright (c) 2021 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -34,9 +34,10 @@ function ansible_adhoc () {
die "Failed to read hosts from working topology!"
}
pushd "${TOOLS_DIR}"/testbed-setup/ansible || die "Pushd failed!"
- ANSIBLE_STDOUT_CALLBACK=yaml \
- ANSIBLE_PIPELINING=true \
- ansible \
+ export ANSIBLE_HOST_KEY_CHECKING=False
+ export ANSIBLE_STDOUT_CALLBACK=yaml
+ export ANSIBLE_PIPELINING=true
+ ansible-playbook \
--vault-password-file=vault_pass \
--extra-vars '@vault.yml' \
--inventory inventories/lf_inventory/hosts site.yaml \
@@ -65,9 +66,10 @@ function ansible_playbook () {
die "Failed to read hosts from working topology!"
}
pushd "${TOOLS_DIR}"/testbed-setup/ansible || die "Pushd failed!"
- ANSIBLE_STDOUT_CALLBACK=yaml \
- ANSIBLE_PIPELINING=true \
- ansible-playbook \
+ export ANSIBLE_HOST_KEY_CHECKING=False
+ export ANSIBLE_STDOUT_CALLBACK=yaml
+ export ANSIBLE_PIPELINING=true
+ ansible-playbook \
--vault-password-file=vault_pass \
--extra-vars '@vault.yml' \
--inventory inventories/lf_inventory/hosts site.yaml \
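Note: the hunks above switch from prefixing a single command with Ansible settings to exporting them, so every invocation in the function inherits them, and host key checking is disabled for freshly staged testbeds. A minimal Python sketch of the equivalent invocation, assuming the same playbook layout (sketch only, not repository code):

import os
import subprocess

# Exported-variable pattern from ansible.sh above, expressed in Python.
env = dict(
    os.environ,
    ANSIBLE_HOST_KEY_CHECKING="False",  # no interactive host key prompts
    ANSIBLE_STDOUT_CALLBACK="yaml",     # YAML-formatted task output
    ANSIBLE_PIPELINING="true",          # fewer SSH round-trips per task
)
subprocess.run(
    [
        "ansible-playbook",
        "--vault-password-file=vault_pass",
        "--extra-vars", "@vault.yml",
        "--inventory", "inventories/lf_inventory/hosts",
        "site.yaml",
    ],
    env=env,
    check=True,  # raise CalledProcessError if the playbook fails
)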
diff --git a/resources/libraries/python/Constants.py b/resources/libraries/python/Constants.py
index faf861d89d..c21c5b8cc5 100644
--- a/resources/libraries/python/Constants.py
+++ b/resources/libraries/python/Constants.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 Cisco and/or its affiliates.
+# Copyright (c) 2021 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -179,13 +179,13 @@ class Constants:
QEMU_VM_DPDK = u"/opt/dpdk-20.02"
# Docker container SUT image
- DOCKER_SUT_IMAGE_UBUNTU = u"snergster/csit-sut:latest"
+ DOCKER_SUT_IMAGE_UBUNTU = u"csit_sut-ubuntu2004:local"
# Docker container arm SUT image
- DOCKER_SUT_IMAGE_UBUNTU_ARM = u"snergster/csit-arm-sut:latest"
+ DOCKER_SUT_IMAGE_UBUNTU_ARM = u"csit_sut-ubuntu2004:local"
# TRex install directory
- TREX_INSTALL_DIR = u"/opt/trex-core-2.82"
+ TREX_INSTALL_DIR = u"/opt/trex-core-2.88"
# TODO: Find the right way how to use it in trex profiles
# TRex pcap files directory
diff --git a/resources/libraries/python/ContainerUtils.py b/resources/libraries/python/ContainerUtils.py
index 77e139f80f..2810623e24 100644
--- a/resources/libraries/python/ContainerUtils.py
+++ b/resources/libraries/python/ContainerUtils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 Cisco and/or its affiliates.
+# Copyright (c) 2021 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -778,7 +778,7 @@ class LXC(ContainerEngine):
else u"amd64"
image = self.container.image if self.container.image \
- else f"-d ubuntu -r bionic -a {target_arch}"
+ else f"-d ubuntu -r focal -a {target_arch}"
cmd = f"lxc-create -t download --name {self.container.name} " \
f"-- {image} --no-validate"
@@ -1000,13 +1000,13 @@ class Docker(ContainerEngine):
else Constants.DOCKER_SUT_IMAGE_UBUNTU
setattr(self.container, u"image", img)
- cmd = f"docker pull {self.container.image}"
-
- ret, _, _ = self.container.ssh.exec_command_sudo(cmd, timeout=1800)
- if int(ret) != 0:
- raise RuntimeError(
- f"Failed to create container {self.container.name}."
- )
+ if "/" in self.container.image:
+ cmd = f"docker pull {self.container.image}"
+ ret, _, _ = self.container.ssh.exec_command_sudo(cmd, timeout=1800)
+ if int(ret) != 0:
+ raise RuntimeError(
+ f"Failed to create container {self.container.name}."
+ )
if self.container.cpuset_cpus:
self._configure_cgroup(u"docker")
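Note: the hunk above makes the docker pull conditional. The heuristic: an image reference containing "/" names a registry path (e.g. snergster/csit-sut:latest) and is pulled, while a bare local tag such as csit_sut-ubuntu2004:local is assumed to be pre-built on the host. A standalone sketch of that rule (the function name is illustrative, not the class API):

from typing import Optional

def pull_command(image: str) -> Optional[str]:
    """Return a docker pull command for registry images, None for local tags."""
    # A "/" in the reference implies a registry path; bare tags are local.
    if "/" in image:
        return f"docker pull {image}"
    return None

assert pull_command("snergster/csit-sut:latest") == "docker pull snergster/csit-sut:latest"
assert pull_command("csit_sut-ubuntu2004:local") is None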
diff --git a/resources/libraries/python/PapiExecutor.py b/resources/libraries/python/PapiExecutor.py
index 46fe5d583b..f4d01704b3 100644
--- a/resources/libraries/python/PapiExecutor.py
+++ b/resources/libraries/python/PapiExecutor.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 Cisco and/or its affiliates.
+# Copyright (c) 2021 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -23,6 +23,8 @@ import subprocess
import sys
import tempfile
import time
+from collections import UserDict
+
from pprint import pformat
from robot.api import logger
@@ -63,11 +65,11 @@ def dictize(obj):
"""
if not hasattr(obj, u"_asdict"):
return obj
- ret = obj._asdict()
- old_get = ret.__getitem__
+ overriden = UserDict(obj._asdict())
+ old_get = overriden.__getitem__
new_get = lambda self, key: dictize(old_get(self, key))
- ret.__getitem__ = new_get
- return ret
+ overriden.__getitem__ = new_get
+ return overriden
class PapiSocketExecutor:
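Note: the dictize() rewrite above is forced by Python semantics: namedtuple._asdict() returns a plain dict, and assigning to __getitem__ on a built-in dict instance raises AttributeError, while a collections.UserDict instance accepts the attribute. A hedged sketch of the same lazy-conversion idea written as a UserDict subclass (an illustration, not the repository's code):

from collections import UserDict, namedtuple

class Dictized(UserDict):
    """Mapping that dictizes namedtuple-like values on access."""
    def __getitem__(self, key):
        value = self.data[key]
        # Anything exposing _asdict() (e.g. a PAPI reply namedtuple) is
        # wrapped lazily, so nested fields support mapping-style access.
        return Dictized(value._asdict()) if hasattr(value, "_asdict") else value

Inner = namedtuple("Inner", "x")
Outer = namedtuple("Outer", "inner")
reply = Dictized(Outer(inner=Inner(x=1))._asdict())
assert reply["inner"]["x"] == 1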
diff --git a/resources/libraries/python/QemuUtils.py b/resources/libraries/python/QemuUtils.py
index b29e19c035..b236090464 100644
--- a/resources/libraries/python/QemuUtils.py
+++ b/resources/libraries/python/QemuUtils.py
@@ -138,7 +138,6 @@ class QemuUtils:
)
self._params.add_with_value(u"m", f"{self._opt.get(u'mem')}M")
self._params.add_with_value(u"numa", u"node,memdev=mem")
- self._params.add_with_value(u"balloon", u"none")
def add_net_user(self):
"""Set managment port forwarding."""
diff --git a/resources/tools/presentation/conf.py b/resources/tools/presentation/conf.py
index 1bff7718ae..716e52dd4d 100644
--- a/resources/tools/presentation/conf.py
+++ b/resources/tools/presentation/conf.py
@@ -163,7 +163,7 @@ html_theme_options = {
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
-html_theme_path = [u'env/lib/python3.6/site-packages/sphinx_rtd_theme']
+html_theme_path = [u'env/lib/python3.8/site-packages/sphinx_rtd_theme']
# html_static_path = ['_build/_static']
html_static_path = [u'_tmp/src/_static']
diff --git a/resources/tools/presentation/conf_cpta/conf.py b/resources/tools/presentation/conf_cpta/conf.py
index 50531d94da..53af4ed68a 100644
--- a/resources/tools/presentation/conf_cpta/conf.py
+++ b/resources/tools/presentation/conf_cpta/conf.py
@@ -91,7 +91,7 @@ html_theme = u'sphinx_rtd_theme'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
-html_theme_path = [u'env/lib/python3.6/site-packages/sphinx_rtd_theme']
+html_theme_path = [u'env/lib/python3.8/site-packages/sphinx_rtd_theme']
# html_static_path = ['_build/_static']
html_static_path = [u'../_tmp/src/_static']
diff --git a/resources/tools/presentation/input_data_files.py b/resources/tools/presentation/input_data_files.py
index 8b941f2f94..d4cc8c64e6 100644
--- a/resources/tools/presentation/input_data_files.py
+++ b/resources/tools/presentation/input_data_files.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 Cisco and/or its affiliates.
+# Copyright (c) 2021 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -21,7 +21,7 @@ import gzip
from os import rename, mkdir
from os.path import join
-from http.client import responses
+from http.client import responses, HTTPException
from zipfile import ZipFile, is_zipfile, BadZipfile
import requests
@@ -30,6 +30,8 @@ from requests.adapters import HTTPAdapter, Retry
from requests.exceptions import RequestException
from requests import codes
+from urllib3.exceptions import HTTPError
+
from pal_errors import PresentationError
@@ -42,15 +44,19 @@ SEPARATOR = u"__"
REGEX_RELEASE = re.compile(r'(\D*)(\d{4}|master)(\D*)')
-def _download_file(url, file_name, arch=False):
+def _download_file(url, file_name, arch=False, verify=True, repeat=1):
"""Download a file with input data.
:param url: URL to the file to download.
:param file_name: Name of file to download.
- :param arch: If True, also .gz file is downloaded
+ :param arch: If True, also .gz file is downloaded.
+    :param verify: If True, verify the certificate.
+ :param repeat: The number of attempts to download the file.
:type url: str
:type file_name: str
:type arch: bool
+ :type verify: bool
+ :type repeat: int
:returns: True if the download was successful, otherwise False.
:rtype: bool
"""
@@ -86,56 +92,62 @@ def _download_file(url, file_name, arch=False):
return session
success = False
- session = None
- try:
- logging.info(f" Connecting to {url} ...")
- session = requests_retry_session()
- response = session.get(url, stream=True)
- code = response.status_code
- logging.info(f" {code}: {responses[code]}")
-
- if code != codes[u"OK"]:
- if session:
- session.close()
- url = url.replace(u"_info", u"")
+ while repeat:
+ repeat -= 1
+ session = None
+ try:
logging.info(f" Connecting to {url} ...")
session = requests_retry_session()
- response = session.get(url, stream=True)
+ response = session.get(url, stream=True, verify=verify)
code = response.status_code
logging.info(f" {code}: {responses[code]}")
- if code != codes[u"OK"]:
- return False, file_name
- file_name = file_name.replace(u"_info", u"")
-
- dst_file_name = file_name.replace(u".gz", u"")
- logging.info(f" Downloading the file {url} to {dst_file_name} ...")
- with open(dst_file_name, u"wb") as file_handle:
- for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
- if chunk:
- file_handle.write(chunk)
- if arch and u".gz" in file_name:
+ if code != codes[u"OK"]:
+ if session:
+ session.close()
+ url = url.replace(u"_info", u"")
+ logging.info(f" Connecting to {url} ...")
+ session = requests_retry_session()
+ response = session.get(url, stream=True, verify=verify)
+ code = response.status_code
+ logging.info(f" {code}: {responses[code]}")
+ if code != codes[u"OK"]:
+ return False, file_name
+ file_name = file_name.replace(u"_info", u"")
+
+ dst_file_name = file_name.replace(u".gz", u"")
+ logging.info(f" Downloading the file {url} to {dst_file_name}")
+ with open(dst_file_name, u"wb") as file_handle:
+ for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
+ if chunk:
+ file_handle.write(chunk)
+
+ if arch and u".gz" in file_name:
+ if session:
+ session.close()
+ logging.info(f" Downloading the file {url} to {file_name}")
+ session = requests_retry_session()
+ response = session.get(url, stream=True, verify=verify)
+ if response.status_code == codes[u"OK"]:
+ with open(file_name, u"wb") as file_handle:
+ file_handle.write(response.raw.read())
+ else:
+ logging.error(
+ f"Not possible to download the file "
+ f"{url} to {file_name}"
+ )
+
+ success = True
+ repeat = 0
+ except (HTTPException, HTTPError) as err:
+ logging.error(f"Connection broken:\n{repr(err)}")
+ except RequestException as err:
+ logging.error(f"HTTP Request exception:\n{repr(err)}")
+ except (IOError, ValueError, KeyError) as err:
+ logging.error(f"Download failed.\n{repr(err)}")
+ finally:
if session:
session.close()
- logging.info(f" Downloading the file {url} to {file_name} ...")
- session = requests_retry_session()
- response = session.get(url, stream=True)
- if response.status_code == codes[u"OK"]:
- with open(file_name, u"wb") as file_handle:
- file_handle.write(response.raw.read())
- else:
- logging.error(
- f"Not possible to download the file {url} to {file_name}"
- )
-
- success = True
- except RequestException as err:
- logging.error(f"HTTP Request exception:\n{repr(err)}")
- except (IOError, ValueError, KeyError) as err:
- logging.error(f"Download failed.\n{repr(err)}")
- finally:
- if session:
- session.close()
logging.info(u" Download finished.")
return success, file_name
@@ -200,27 +212,44 @@ def download_and_unzip_data_file(spec, job, build, pid):
:rtype: bool
"""
- # Try to download .gz from logs.fd.io
+ success = False
file_name = spec.input[u"file-name"]
- url = u"{0}/{1}".format(
- spec.environment[u'urls'][u'URL[NEXUS,LOG]'],
- spec.input[u'download-path'].format(
- job=job, build=build[u'build'], filename=file_name
- )
- )
new_name = join(
spec.environment[u"paths"][u"DIR[WORKING,DATA]"],
f"{job}{SEPARATOR}{build[u'build']}{SEPARATOR}{file_name}"
)
-
- logging.info(f"Trying to download {url}")
-
arch = bool(spec.configuration.get(u"archive-inputs", True))
- success, downloaded_name = _download_file(url, new_name, arch=arch)
+ downloaded_name = u""
+
+ # Try to download .gz from s3_storage
+ for path in spec.input[u'download-path']:
+ url = u"{0}/{1}".format(
+ spec.environment[u'urls'][u'URL[S3_STORAGE,LOG]'],
+ path.format(job=job, build=build[u'build'], filename=file_name)
+ )
+ logging.info(f"Trying to download {url}")
+ success, downloaded_name = _download_file(
+ url, new_name, arch=arch, verify=False, repeat=3
+ )
+ if success:
+ break
if not success:
+ # Try to download .gz from logs.fd.io
+ for path in spec.input[u'download-path']:
+ url = u"{0}/{1}".format(
+ spec.environment[u'urls'][u'URL[NEXUS,LOG]'],
+ path.format(job=job, build=build[u'build'], filename=file_name)
+ )
+ logging.info(f"Trying to download {url}")
+ success, downloaded_name = _download_file(
+ url, new_name, arch=arch, verify=True, repeat=3
+ )
+ if success:
+ break
+ if not success:
# Try to download .gz or .zip from docs.fd.io
file_name = (spec.input[u"file-name"], spec.input[u"zip-file-name"])
release = re.search(REGEX_RELEASE, job).group(2)
@@ -254,14 +283,11 @@ def download_and_unzip_data_file(spec, job, build, pid):
break
if not success:
-
# Try to download .zip from jenkins.fd.io
file_name = spec.input[u"zip-file-name"]
download_path = spec.input[u"zip-download-path"]
if job.startswith(u"csit-"):
url = spec.environment[u"urls"][u"URL[JENKINS,CSIT]"]
- elif job.startswith(u"hc2vpp-"):
- url = spec.environment[u"urls"][u"URL[JENKINS,HC]"]
else:
raise PresentationError(f"No url defined for the job {job}.")
@@ -273,9 +299,7 @@ def download_and_unzip_data_file(spec, job, build, pid):
spec.environment[u"paths"][u"DIR[WORKING,DATA]"],
f"{job}{SEPARATOR}{build[u'build']}{SEPARATOR}{file_name}"
)
-
logging.info(f"Downloading {url}")
-
success, downloaded_name = _download_file(url, new_name)
if success and downloaded_name.endswith(u".zip"):
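Note: _download_file() above now retries the whole download up to repeat times and can disable TLS verification for the internal S3 endpoint (verify=False), while logs.fd.io keeps verify=True. A condensed sketch of the retry skeleton using requests directly (simplified from the hunks above; the full code also catches HTTPException/HTTPError and IO errors):

import logging
import requests

def download(url, path, verify=True, repeat=1):
    """Attempt the download up to `repeat` times; return True on success."""
    while repeat:
        repeat -= 1
        try:
            response = requests.get(url, stream=True, verify=verify, timeout=60)
            response.raise_for_status()
            with open(path, "wb") as handle:
                for chunk in response.iter_content(chunk_size=4 * 1024 * 1024):
                    if chunk:
                        handle.write(chunk)
            return True  # success ends the loop, like `repeat = 0` above
        except requests.RequestException as err:
            logging.error(f"Attempt failed:\n{repr(err)}")
    return False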
diff --git a/resources/tools/presentation/requirements.txt b/resources/tools/presentation/requirements.txt
index 0063926a8c..62af5202ab 100644
--- a/resources/tools/presentation/requirements.txt
+++ b/resources/tools/presentation/requirements.txt
@@ -1,6 +1,6 @@
-Sphinx==2.2.1
-sphinx-rtd-theme==0.4.0
-sphinxcontrib-programoutput==0.15
+Sphinx==3.5.4
+sphinx-rtd-theme==0.5.2
+sphinxcontrib-programoutput==0.17
robotframework==3.1.2
PyYAML==5.1
numpy==1.17.3
diff --git a/resources/tools/presentation/run_cpta.sh b/resources/tools/presentation/run_cpta.sh
index 1aa5703162..043ac17834 100755
--- a/resources/tools/presentation/run_cpta.sh
+++ b/resources/tools/presentation/run_cpta.sh
@@ -22,6 +22,8 @@ source ${DIR[WORKING]}/env/bin/activate
# Install python dependencies:
pip3 install -r requirements.txt
+pip3 freeze requirements.txt
+
export PYTHONPATH=`pwd`:`pwd`/../../../:`pwd`/../../libraries/python
STATUS=$(python pal.py \
diff --git a/resources/tools/presentation/run_report.sh b/resources/tools/presentation/run_report.sh
index 52757efbf7..f006669b80 100755
--- a/resources/tools/presentation/run_report.sh
+++ b/resources/tools/presentation/run_report.sh
@@ -9,7 +9,6 @@ typeset -A CFG
typeset -A DIR
DIR[WORKING]=_tmp
-CFG[BLD_LATEX]=1
# Install system dependencies
sudo apt-get -y update
diff --git a/resources/tools/presentation/specification.yaml b/resources/tools/presentation/specification.yaml
index 68777bbaed..6984d052e0 100644
--- a/resources/tools/presentation/specification.yaml
+++ b/resources/tools/presentation/specification.yaml
@@ -98,7 +98,7 @@
urls:
URL[JENKINS,CSIT]: "https://jenkins.fd.io/view/csit/job"
- URL[JENKINS,HC]: "https://jenkins.fd.io/view/hc2vpp/job"
+ URL[S3_STORAGE,LOG]: "https://logs.nginx.service.consul/vex-yul-rot-jenkins-1"
URL[NEXUS,LOG]: "https://logs.fd.io/production/vex-yul-rot-jenkins-1"
URL[NEXUS,DOC]: "https://docs.fd.io/csit"
DIR[NEXUS,DOC]: "report/_static/archive"
@@ -2559,7 +2559,9 @@
arch-file-format:
- ".gz"
- ".zip"
- download-path: "{job}/{build}/archives/{filename}"
+ download-path:
+ - "{job}/{build}/{filename}"
+ - "{job}/{build}/archives/{filename}"
extract: "output.xml"
zip-file-name: "robot-plugin.zip"
@@ -3323,8 +3325,8 @@
format:
html:
- full
- pdf:
- - minimal
+# pdf:
+# - minimal
################################################################################
### T A B L E S ###
diff --git a/resources/tools/presentation/specification_CPTA.yaml b/resources/tools/presentation/specification_CPTA.yaml
index 41d5987201..82b6ed6362 100644
--- a/resources/tools/presentation/specification_CPTA.yaml
+++ b/resources/tools/presentation/specification_CPTA.yaml
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 Cisco and/or its affiliates.
+# Copyright (c) 2021 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -44,6 +44,7 @@
urls:
URL[JENKINS,CSIT]: "https://jenkins.fd.io/view/csit/job"
+ URL[S3_STORAGE,LOG]: "https://logs.nginx.service.consul/vex-yul-rot-jenkins-1"
URL[NEXUS,LOG]: "https://logs.fd.io/production/vex-yul-rot-jenkins-1"
URL[NEXUS,DOC]: "https://docs.fd.io/csit"
DIR[NEXUS,DOC]: "trending/_static/archive"
@@ -341,7 +342,9 @@
general:
file-name: "output_info.xml.gz"
file-format: ".gz"
- download-path: "{job}/{build}/archives/{filename}"
+ download-path:
+ - "{job}/{build}/archives/{filename}"
+ - "{job}/{build}/{filename}"
extract: "output.xml"
# Number of days from now to the past. Only files generated in this
# time period are used.
diff --git a/resources/tools/presentation/specification_local.yaml b/resources/tools/presentation/specification_local.yaml
index 393228c1a5..1971919532 100644
--- a/resources/tools/presentation/specification_local.yaml
+++ b/resources/tools/presentation/specification_local.yaml
@@ -892,7 +892,9 @@
arch-file-format:
- ".gz"
- ".zip"
- download-path: "{job}/{build}/archives/{filename}"
+ download-path:
+ - "{job}/{build}/archives/{filename}"
+ - "{job}/{build}/{filename}"
extract: "output.xml"
# Specifies the secondary source - Jenkins, it is used if the source file
diff --git a/resources/tools/testbed-setup/ansible/dev.yaml b/resources/tools/testbed-setup/ansible/dev.yaml
index 48ec6b21c5..6f6d2a7cb5 100644
--- a/resources/tools/testbed-setup/ansible/dev.yaml
+++ b/resources/tools/testbed-setup/ansible/dev.yaml
@@ -5,6 +5,12 @@
remote_user: testuser
become: yes
become_user: root
+ gather_facts: false
+ pre_tasks:
+ - name: Gathering Facts
+ gather_facts:
+ tags:
+ - always
roles:
- role: user_add
tags: user_add
diff --git a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.13.yaml b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.13.yaml
deleted file mode 100644
index 443f7255f8..0000000000
--- a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.13.yaml
+++ /dev/null
@@ -1,32 +0,0 @@
----
-# file: host_vars/10.30.51.13.yaml
-
-hostname: "s44-nomad"
-
-# User management.
-users:
- - username: localadmin
- groups: [adm, sudo]
- password: "$6$FIsbVDQR$5D0wgufOd2FtnmOiRNsGlgg6Loh.0x3dWSj72DSQnqisSyE9DROfgSgA6s0yxDwz4Jd5SRTXiTKuRYuSQ5POI1"
- ssh_key:
- - "ssh-rsa AAAAB3NzaC1yc2EAAAABJQAAAQEAgObJFDIMmPwQhhkjAynvlbwpM5yeSewyaE7vTLaFf4uFz4vmsE2hFf6B2xXHUGLVwoVfk91UeK7LOGrdDpoDDHzvPZXj5NmZI+WiWax5y2pQZNkcSZws0ENCeEc4hPwc4veJ1JmhokF4Bsmu14HyFMaFUhM8897jtJwsh+9fLA/no0iPGaQqEtRUQhkV+P4jCEPoY0qdRZAzVw/rY4EGAMhsJe3EJmyj63OfrrkG3+hvSLFo5pDxHQr3pZd/c6ukI7xMef48PosAvGCm3oxzb/Gu9PZIGuHLczY+tCnzCkY7MO7E+IWgjXrUAfYwSWz8XmFmA9LLe26DT5jkcK8hGQ== pmikus@cisco.com"
- - username: testuser
- groups: [adm, sudo]
- password: "$6$zpBUdQ4q$P2zKclumvCndWujgP/qQ8eMk3YZk7ESAom04Fqp26hJH2jWkMXEX..jqxzMdDLJKiDaDHIaSkQMVjHzd3cRLs1"
- ssh_key:
- - "ssh-rsa AAAAB3NzaC1yc2EAAAABJQAAAQEAgObJFDIMmPwQhhkjAynvlbwpM5yeSewyaE7vTLaFf4uFz4vmsE2hFf6B2xXHUGLVwoVfk91UeK7LOGrdDpoDDHzvPZXj5NmZI+WiWax5y2pQZNkcSZws0ENCeEc4hPwc4veJ1JmhokF4Bsmu14HyFMaFUhM8897jtJwsh+9fLA/no0iPGaQqEtRUQhkV+P4jCEPoY0qdRZAzVw/rY4EGAMhsJe3EJmyj63OfrrkG3+hvSLFo5pDxHQr3pZd/c6ukI7xMef48PosAvGCm3oxzb/Gu9PZIGuHLczY+tCnzCkY7MO7E+IWgjXrUAfYwSWz8XmFmA9LLe26DT5jkcK8hGQ== pmikus@cisco.com"
-
-# Nomad settings.
-nomad_certificates:
- - src: "{{ vault_nomad_v2_ca_file }}"
- dest: "{{ nomad_ca_file }}"
- - src: "{{ vault_nomad_v2_cert_file }}"
- dest: "{{ nomad_cert_file }}"
- - src: "{{ vault_nomad_v2_key_file }}"
- dest: "{{ nomad_key_file }}"
-nomad_datacenter: "yul1"
-nomad_encrypt: "Y4T+5JGx1C3l2NFBBvkTWQ=="
-nomad_name: "{{ hostname }}-{{ ansible_architecture }}"
-nomad_node_role: "server"
-nomad_retry_servers: [ "10.30.51.32", "10.30.51.33", "10.30.51.14" ]
-nomad_servers: [ "10.30.51.13:4647" ]
diff --git a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.14.yaml b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.14.yaml
deleted file mode 100644
index 4da927ebd4..0000000000
--- a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.14.yaml
+++ /dev/null
@@ -1,32 +0,0 @@
----
-# file: host_vars/10.30.51.14.yaml
-
-hostname: "s45-nomad"
-
-# User management.
-users:
- - username: localadmin
- groups: [adm, sudo]
- password: "$6$FIsbVDQR$5D0wgufOd2FtnmOiRNsGlgg6Loh.0x3dWSj72DSQnqisSyE9DROfgSgA6s0yxDwz4Jd5SRTXiTKuRYuSQ5POI1"
- ssh_key:
- - "ssh-rsa AAAAB3NzaC1yc2EAAAABJQAAAQEAgObJFDIMmPwQhhkjAynvlbwpM5yeSewyaE7vTLaFf4uFz4vmsE2hFf6B2xXHUGLVwoVfk91UeK7LOGrdDpoDDHzvPZXj5NmZI+WiWax5y2pQZNkcSZws0ENCeEc4hPwc4veJ1JmhokF4Bsmu14HyFMaFUhM8897jtJwsh+9fLA/no0iPGaQqEtRUQhkV+P4jCEPoY0qdRZAzVw/rY4EGAMhsJe3EJmyj63OfrrkG3+hvSLFo5pDxHQr3pZd/c6ukI7xMef48PosAvGCm3oxzb/Gu9PZIGuHLczY+tCnzCkY7MO7E+IWgjXrUAfYwSWz8XmFmA9LLe26DT5jkcK8hGQ== pmikus@cisco.com"
- - username: testuser
- groups: [adm, sudo]
- password: "$6$zpBUdQ4q$P2zKclumvCndWujgP/qQ8eMk3YZk7ESAom04Fqp26hJH2jWkMXEX..jqxzMdDLJKiDaDHIaSkQMVjHzd3cRLs1"
- ssh_key:
- - "ssh-rsa AAAAB3NzaC1yc2EAAAABJQAAAQEAgObJFDIMmPwQhhkjAynvlbwpM5yeSewyaE7vTLaFf4uFz4vmsE2hFf6B2xXHUGLVwoVfk91UeK7LOGrdDpoDDHzvPZXj5NmZI+WiWax5y2pQZNkcSZws0ENCeEc4hPwc4veJ1JmhokF4Bsmu14HyFMaFUhM8897jtJwsh+9fLA/no0iPGaQqEtRUQhkV+P4jCEPoY0qdRZAzVw/rY4EGAMhsJe3EJmyj63OfrrkG3+hvSLFo5pDxHQr3pZd/c6ukI7xMef48PosAvGCm3oxzb/Gu9PZIGuHLczY+tCnzCkY7MO7E+IWgjXrUAfYwSWz8XmFmA9LLe26DT5jkcK8hGQ== pmikus@cisco.com"
-
-# Nomad settings.
-nomad_certificates:
- - src: "{{ vault_nomad_v2_ca_file }}"
- dest: "{{ nomad_ca_file }}"
- - src: "{{ vault_nomad_v2_cert_file }}"
- dest: "{{ nomad_cert_file }}"
- - src: "{{ vault_nomad_v2_key_file }}"
- dest: "{{ nomad_key_file }}"
-nomad_datacenter: "yul1"
-nomad_encrypt: "Y4T+5JGx1C3l2NFBBvkTWQ=="
-nomad_name: "{{ hostname }}-{{ ansible_architecture }}"
-nomad_node_role: "server"
-nomad_retry_servers: [ "10.30.51.32", "10.30.51.33", "10.30.51.13" ]
-nomad_servers: [ "10.30.51.13:4647" ]
diff --git a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.28.yaml b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.28.yaml
index 617304aef6..bf1da2a759 100644
--- a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.28.yaml
+++ b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.28.yaml
@@ -33,6 +33,7 @@ nomad_options:
driver.raw_exec.enable: 1
docker.cleanup.image: false
docker.privileged.enabled: true
+ docker.volumes.enabled: true
driver.whitelist: "docker,raw_exec,exec"
fingerprint.network.disallow_link_local: true
nomad_servers: [ "10.30.51.32:4647", "10.30.51.33:4647" ]
@@ -53,4 +54,11 @@ consul_node_role: "client"
consul_retry_servers:
- "10.30.51.30"
- "10.30.51.32"
- - "10.30.51.33" \ No newline at end of file
+ - "10.30.51.33"
+
+# Docker daemon settings.
+docker_daemon:
+ # https://docs.docker.com/engine/reference/commandline/dockerd/#daemon-configuration-file
+ dns: [ "172.17.0.1" ]
+ dns-opts: []
+ dns-search: [ "{{ansible_hostname}}" ] \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.29.yaml b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.29.yaml
index 0cc15e23b7..5b3a1725b3 100644
--- a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.29.yaml
+++ b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.29.yaml
@@ -33,6 +33,7 @@ nomad_options:
driver.raw_exec.enable: 1
docker.cleanup.image: false
docker.privileged.enabled: true
+ docker.volumes.enabled: true
driver.whitelist: "docker,raw_exec,exec"
nomad_servers: [ "10.30.51.32:4647", "10.30.51.33:4647" ]
@@ -52,4 +53,11 @@ consul_node_role: "client"
consul_retry_servers:
- "10.30.51.30"
- "10.30.51.32"
- - "10.30.51.33" \ No newline at end of file
+ - "10.30.51.33"
+
+# Docker daemon settings.
+docker_daemon:
+ # https://docs.docker.com/engine/reference/commandline/dockerd/#daemon-configuration-file
+ dns: [ "172.17.0.1" ]
+ dns-opts: []
+ dns-search: [ "{{ansible_hostname}}" ] \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.30.yaml b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.30.yaml
index 75f43beba2..13306a74a9 100644
--- a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.30.yaml
+++ b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.30.yaml
@@ -34,6 +34,7 @@ nomad_options:
driver.raw_exec.enable: 1
docker.cleanup.image: false
docker.privileged.enabled: true
+ docker.volumes.enabled: true
driver.whitelist: "docker,raw_exec,exec"
fingerprint.network.disallow_link_local: true
nomad_retry_servers: [ "10.30.51.32", "10.30.51.33" ]
@@ -54,4 +55,11 @@ consul_node_name: "{{ hostname }}"
consul_node_role: "both"
consul_retry_servers:
- "10.30.51.32"
- - "10.30.51.33" \ No newline at end of file
+ - "10.30.51.33"
+
+# Docker daemon settings.
+docker_daemon:
+ # https://docs.docker.com/engine/reference/commandline/dockerd/#daemon-configuration-file
+ dns: [ "172.17.0.1" ]
+ dns-opts: []
+ dns-search: [ "{{ansible_hostname}}" ] \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.32.yaml b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.32.yaml
index f8662eac43..a52008f023 100644
--- a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.32.yaml
+++ b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.32.yaml
@@ -34,6 +34,7 @@ nomad_options:
driver.raw_exec.enable: 1
docker.cleanup.image: false
docker.privileged.enabled: true
+ docker.volumes.enabled: true
driver.whitelist: "docker,raw_exec,exec"
nomad_retry_servers: [ "10.30.51.33", "10.30.51.30" ]
nomad_servers: [ "10.30.51.32:4647" ]
@@ -53,4 +54,11 @@ consul_node_name: "{{ hostname }}"
consul_node_role: "both"
consul_retry_servers:
- "10.30.51.30"
- - "10.30.51.33" \ No newline at end of file
+ - "10.30.51.33"
+
+# Docker daemon settings.
+docker_daemon:
+ # https://docs.docker.com/engine/reference/commandline/dockerd/#daemon-configuration-file
+ dns: [ "172.17.0.1" ]
+ dns-opts: []
+ dns-search: [ "{{ansible_hostname}}" ] \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.33.yaml b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.33.yaml
index ae47f9f23a..7ab2f823cb 100644
--- a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.33.yaml
+++ b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.33.yaml
@@ -34,6 +34,7 @@ nomad_options:
driver.raw_exec.enable: 1
docker.cleanup.image: false
docker.privileged.enabled: true
+ docker.volumes.enabled: true
driver.whitelist: "docker,raw_exec,exec"
nomad_retry_servers: [ "10.30.51.32", "10.30.51.30" ]
nomad_servers: [ "10.30.51.33:4647" ]
@@ -53,4 +54,11 @@ consul_node_name: "{{ hostname }}"
consul_node_role: "both"
consul_retry_servers:
- "10.30.51.30"
- - "10.30.51.32" \ No newline at end of file
+ - "10.30.51.32"
+
+# Docker daemon settings.
+docker_daemon:
+ # https://docs.docker.com/engine/reference/commandline/dockerd/#daemon-configuration-file
+ dns: [ "172.17.0.1" ]
+ dns-opts: []
+ dns-search: [ "{{ansible_hostname}}" ] \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.34.yaml b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.34.yaml
index 50981b8d2e..a45cda8e86 100644
--- a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.34.yaml
+++ b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.34.yaml
@@ -34,6 +34,7 @@ nomad_options:
driver.raw_exec.enable: 1
docker.cleanup.image: false
docker.privileged.enabled: true
+ docker.volumes.enabled: true
driver.whitelist: "docker,raw_exec,exec"
nomad_retry_servers: [ "10.30.51.32", "10.30.51.33" ]
nomad_servers: [ "10.30.51.33:4647" ]
@@ -54,4 +55,11 @@ consul_node_role: "client"
consul_retry_servers:
- "10.30.51.30"
- "10.30.51.32"
- - "10.30.51.33" \ No newline at end of file
+ - "10.30.51.33"
+
+# Docker daemon settings.
+docker_daemon:
+ # https://docs.docker.com/engine/reference/commandline/dockerd/#daemon-configuration-file
+ dns: [ "172.17.0.1" ]
+ dns-opts: []
+ dns-search: [ "{{ansible_hostname}}" ] \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.35.yaml b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.35.yaml
index 48402542ce..f609a839c4 100644
--- a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.35.yaml
+++ b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.35.yaml
@@ -34,6 +34,7 @@ nomad_options:
driver.raw_exec.enable: 1
docker.cleanup.image: false
docker.privileged.enabled: true
+ docker.volumes.enabled: true
driver.whitelist: "docker,raw_exec,exec"
nomad_retry_servers: [ "10.30.51.32", "10.30.51.33" ]
nomad_servers: [ "10.30.51.33:4647" ]
@@ -54,4 +55,11 @@ consul_node_role: "client"
consul_retry_servers:
- "10.30.51.30"
- "10.30.51.32"
- - "10.30.51.33" \ No newline at end of file
+ - "10.30.51.33"
+
+# Docker daemon settings.
+docker_daemon:
+ # https://docs.docker.com/engine/reference/commandline/dockerd/#daemon-configuration-file
+ dns: [ "172.17.0.1" ]
+ dns-opts: []
+ dns-search: [ "{{ansible_hostname}}" ] \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.39.yaml b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.39.yaml
index aebc8b3907..da66a5e293 100644
--- a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.39.yaml
+++ b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.39.yaml
@@ -33,9 +33,10 @@ nomad_name: "{{ hostname }}-{{ ansible_architecture }}"
nomad_node_role: "client"
nomad_node_class: "builder"
nomad_options:
- driver.raw_exec.enable: 0
+ driver.raw_exec.enable: 1
docker.cleanup.image: false
docker.privileged.enabled: true
+ docker.volumes.enabled: true
driver.whitelist: "docker,raw_exec,exec"
nomad_servers: [ "10.30.51.32:4647", "10.30.51.33:4647" ]
nomad_cpu_total_compute: "40000"
@@ -59,4 +60,11 @@ consul_retry_servers:
- "10.30.51.33"
- "10.32.8.14"
- "10.32.8.15"
- - "10.32.8.16" \ No newline at end of file
+ - "10.32.8.16"
+
+# Docker daemon settings.
+docker_daemon:
+ # https://docs.docker.com/engine/reference/commandline/dockerd/#daemon-configuration-file
+ dns: [ "172.17.0.1" ]
+ dns-opts: []
+ dns-search: [ "{{ansible_hostname}}" ] \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.40.yaml b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.40.yaml
index 5f82e294a6..58839c8365 100644
--- a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.40.yaml
+++ b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.40.yaml
@@ -33,9 +33,10 @@ nomad_name: "{{ hostname }}-{{ ansible_architecture }}"
nomad_node_role: "client"
nomad_node_class: "builder"
nomad_options:
- driver.raw_exec.enable: 0
+ driver.raw_exec.enable: 1
docker.cleanup.image: false
docker.privileged.enabled: true
+ docker.volumes.enabled: true
driver.whitelist: "docker,raw_exec,exec"
nomad_servers: [ "10.30.51.32:4647", "10.30.51.33:4647" ]
nomad_cpu_total_compute: "40000"
@@ -59,4 +60,11 @@ consul_retry_servers:
- "10.30.51.33"
- "10.32.8.14"
- "10.32.8.15"
- - "10.32.8.16" \ No newline at end of file
+ - "10.32.8.16"
+
+# Docker daemon settings.
+docker_daemon:
+ # https://docs.docker.com/engine/reference/commandline/dockerd/#daemon-configuration-file
+ dns: [ "172.17.0.1" ]
+ dns-opts: []
+ dns-search: [ "{{ansible_hostname}}" ] \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.49.yaml b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.49.yaml
index 62ef6da2ce..9e6d17fb8f 100644
--- a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.49.yaml
+++ b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.49.yaml
@@ -1,7 +1,7 @@
---
# file: host_vars/10.30.51.49.yaml
-hostname: "s19-t33t34-tg1"
+hostname: "s19-t33t211-tg1"
grub:
audit: "0"
hpet: "disable"
@@ -22,8 +22,9 @@ sysctl:
kernel:
watchdog_cpumask: "0,28,56,84"
vm:
- nr_hugepages: 8192
+ nr_hugepages: 16384
max_map_count: 20000
inventory_ipmi_hostname: '10.30.50.46'
cpu_microarchitecture: "skylake"
+docker_tg: true
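Note: the hugepage numbers in these host vars read more easily as reserved memory. Assuming the default 2 MiB hugepage size, the TG change above doubles the reservation (8192 -> 16384 pages, i.e. 16 GiB -> 32 GiB), while the SUT host vars just below halve theirs (65536 -> 32768 pages, 128 GiB -> 64 GiB). Quick arithmetic:

# 2 MiB hugepages: reserved memory = pages * 2 MiB.
for pages in (8192, 16384, 32768, 65536):
    print(f"{pages:6d} pages -> {pages * 2 // 1024:4d} GiB")
# 8192 -> 16, 16384 -> 32, 32768 -> 64, 65536 -> 128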
diff --git a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.50.yaml b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.50.yaml
index c2a7bd782c..f3b8886a72 100644
--- a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.50.yaml
+++ b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.50.yaml
@@ -4,7 +4,7 @@
hostname: "s1-t11-sut1"
grub:
hugepagesz: "2M"
- nr_hugepages: 65536
+ nr_hugepages: 32768
inventory_ipmi_hostname: "10.30.50.47"
cpu_microarchitecture: "skylake"
@@ -57,4 +57,13 @@ consul_node_role: "client"
consul_retry_servers:
- "10.30.51.30"
- "10.30.51.32"
- - "10.30.51.33" \ No newline at end of file
+ - "10.30.51.33"
+
+# Docker settings.
+docker_daemon:
+ # https://docs.docker.com/engine/reference/commandline/dockerd/#daemon-configuration-file
+ default-shm-size: "1073741824"
+ dns: [ "172.17.0.1" ]
+ dns-opts: []
+ dns-search: [ "{{ ansible_hostname }}" ]
+ host: [ "172.17.0.1:/var/run/docker.sock" ] \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.51.yaml b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.51.yaml
index fb10e11c76..019cd5a968 100644
--- a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.51.yaml
+++ b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.51.yaml
@@ -4,7 +4,7 @@
hostname: "s2-t12-sut1"
grub:
hugepagesz: "2M"
- nr_hugepages: 65536
+ nr_hugepages: 32768
inventory_ipmi_hostname: "10.30.50.48"
cpu_microarchitecture: "skylake"
@@ -57,4 +57,13 @@ consul_node_role: "client"
consul_retry_servers:
- "10.30.51.30"
- "10.30.51.32"
- - "10.30.51.33" \ No newline at end of file
+ - "10.30.51.33"
+
+# Docker settings.
+docker_daemon:
+ # https://docs.docker.com/engine/reference/commandline/dockerd/#daemon-configuration-file
+ default-shm-size: "1073741824"
+ dns: [ "172.17.0.1" ]
+ dns-opts: []
+ dns-search: [ "{{ ansible_hostname }}" ]
+ host: [ "172.17.0.1:/var/run/docker.sock" ] \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.65.yaml b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.65.yaml
index 5889b1e8b8..76b330ae2e 100644
--- a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.65.yaml
+++ b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.65.yaml
@@ -33,9 +33,10 @@ nomad_name: "{{ hostname }}-{{ ansible_architecture }}"
nomad_node_role: "client"
nomad_node_class: "builder"
nomad_options:
- driver.raw_exec.enable: 0
+ driver.raw_exec.enable: 1
docker.cleanup.image: false
docker.privileged.enabled: true
+ docker.volumes.enabled: true
driver.whitelist: "docker,raw_exec,exec"
nomad_servers: [ "10.30.51.32:4647", "10.30.51.33:4647" ]
nomad_cpu_total_compute: "40000"
@@ -56,4 +57,11 @@ consul_node_role: "client"
consul_retry_servers:
- "10.30.51.30"
- "10.30.51.32"
- - "10.30.51.33" \ No newline at end of file
+ - "10.30.51.33"
+
+# Docker daemon settings.
+docker_daemon:
+ # https://docs.docker.com/engine/reference/commandline/dockerd/#daemon-configuration-file
+ dns: [ "172.17.0.1" ]
+ dns-opts: []
+ dns-search: [ "{{ansible_hostname}}" ] \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.66.yaml b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.66.yaml
index 419403890b..5223e4ba11 100644
--- a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.66.yaml
+++ b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.66.yaml
@@ -33,9 +33,10 @@ nomad_name: "{{ hostname }}-{{ ansible_architecture }}"
nomad_node_role: "client"
nomad_node_class: "builder"
nomad_options:
- driver.raw_exec.enable: 0
+ driver.raw_exec.enable: 1
docker.cleanup.image: false
docker.privileged.enabled: true
+ docker.volumes.enabled: true
driver.whitelist: "docker,raw_exec,exec"
nomad_servers: [ "10.30.51.32:4647", "10.30.51.33:4647" ]
nomad_cpu_total_compute: "40000"
@@ -56,4 +57,11 @@ consul_node_role: "client"
consul_retry_servers:
- "10.30.51.30"
- "10.30.51.32"
- - "10.30.51.33" \ No newline at end of file
+ - "10.30.51.33"
+
+# Docker daemon settings.
+docker_daemon:
+ # https://docs.docker.com/engine/reference/commandline/dockerd/#daemon-configuration-file
+ dns: [ "172.17.0.1" ]
+ dns-opts: []
+ dns-search: [ "{{ansible_hostname}}" ] \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.67.yaml b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.67.yaml
index 05e3b2a3af..da9ed6da49 100644
--- a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.67.yaml
+++ b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.67.yaml
@@ -33,9 +33,10 @@ nomad_name: "{{ hostname }}-{{ ansible_architecture }}"
nomad_node_role: "client"
nomad_node_class: "builder"
nomad_options:
- driver.raw_exec.enable: 0
+ driver.raw_exec.enable: 1
docker.cleanup.image: false
docker.privileged.enabled: true
+ docker.volumes.enabled: true
driver.whitelist: "docker,raw_exec,exec"
nomad_servers: [ "10.30.51.32:4647", "10.30.51.33:4647" ]
nomad_cpu_total_compute: "40000"
@@ -56,4 +57,11 @@ consul_node_role: "client"
consul_retry_servers:
- "10.30.51.30"
- "10.30.51.32"
- - "10.30.51.33" \ No newline at end of file
+ - "10.30.51.33"
+
+# Docker daemon settings.
+docker_daemon:
+ # https://docs.docker.com/engine/reference/commandline/dockerd/#daemon-configuration-file
+ dns: [ "172.17.0.1" ]
+ dns-opts: []
+ dns-search: [ "{{ansible_hostname}}" ] \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.68.yaml b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.68.yaml
index b1df9cc69c..0295af4dbd 100644
--- a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.68.yaml
+++ b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.68.yaml
@@ -36,6 +36,7 @@ nomad_options:
driver.raw_exec.enable: 1
docker.cleanup.image: false
docker.privileged.enabled: true
+ docker.volumes.enabled: true
driver.whitelist: "docker,raw_exec,exec"
nomad_servers: [ "10.30.51.32:4647", "10.30.51.33:4647" ]
nomad_cpu_total_compute: "40000"
@@ -56,4 +57,11 @@ consul_node_role: "client"
consul_retry_servers:
- "10.30.51.30"
- "10.30.51.32"
- - "10.30.51.33" \ No newline at end of file
+ - "10.30.51.33"
+
+# Docker daemon settings.
+docker_daemon:
+ # https://docs.docker.com/engine/reference/commandline/dockerd/#daemon-configuration-file
+ dns: [ "172.17.0.1" ]
+ dns-opts: []
+ dns-search: [ "{{ansible_hostname}}" ] \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.69.yaml b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.69.yaml
index 0ebda86285..264c62cf38 100644
--- a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.69.yaml
+++ b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.69.yaml
@@ -1,7 +1,7 @@
---
# file: host_vars/10.30.51.69.yaml
-hostname: "s27-t34-sut1"
+hostname: "s27-t211-sut1"
grub:
audit: "0"
intel_iommu: "on"
diff --git a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.70.yaml b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.70.yaml
index 22107c4f1a..ad33f0276e 100644
--- a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.70.yaml
+++ b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.70.yaml
@@ -61,4 +61,12 @@ consul_node_role: "client"
consul_retry_servers:
- "10.30.51.30"
- "10.30.51.32"
- - "10.30.51.33" \ No newline at end of file
+ - "10.30.51.33"
+
+# Docker settings.
+docker_daemon:
+ # https://docs.docker.com/engine/reference/commandline/dockerd/#daemon-configuration-file
+ dns: [ "172.17.0.1" ]
+ dns-opts: []
+ dns-search: [ "{{ ansible_hostname }}" ]
+ storage-driver: "overlay2" \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.71.yaml b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.71.yaml
index f8b7c92df4..1712346116 100644
--- a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.71.yaml
+++ b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.30.51.71.yaml
@@ -61,4 +61,12 @@ consul_node_role: "client"
consul_retry_servers:
- "10.30.51.30"
- "10.30.51.32"
- - "10.30.51.33" \ No newline at end of file
+ - "10.30.51.33"
+
+# Docker settings.
+docker_daemon:
+ # https://docs.docker.com/engine/reference/commandline/dockerd/#daemon-configuration-file
+ dns: [ "172.17.0.1" ]
+ dns-opts: []
+ dns-search: [ "{{ ansible_hostname }}" ]
+ storage-driver: "overlay2" \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.32.8.14.yaml b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.32.8.14.yaml
index dd2a2f5b95..c609cc7875 100644
--- a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.32.8.14.yaml
+++ b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.32.8.14.yaml
@@ -29,7 +29,7 @@ nomad_datacenter: "yul1"
nomad_encrypt: "Y4T+5JGx1C3l2NFBBvkTWQ=="
nomad_name: "{{ hostname }}-{{ ansible_architecture }}"
nomad_node_role: "client"
-nomad_node_class: "s5ci"
+nomad_node_class: "builder"
nomad_options:
driver.raw_exec.enable: 1
docker.cleanup.image: true
@@ -38,13 +38,13 @@ nomad_options:
driver.whitelist: "docker,raw_exec,exec"
fingerprint.network.disallow_link_local: true
nomad_retry_servers: [ "10.30.51.30", "10.30.51.32", "10.30.51.33" ]
-nomad_servers: [ "10.30.51.32:4647", "10.30.51.33:4647" ]
+nomad_servers: [ "10.30.51.32:4647", "10.30.51.33:4647", "10.30.51.30:4647" ]
nomad_volumes:
- name: "prod-volume-data1-1"
path: "/data"
read_only: false
-# Consul settigs.
+# Consul settings.
consul_nomad_integration: true
consul_certificates:
- src: "{{ vault_consul_v1_ca_file }}"
@@ -60,4 +60,11 @@ consul_node_role: "client"
consul_retry_servers:
- "10.30.51.30"
- "10.30.51.32"
- - "10.30.51.33" \ No newline at end of file
+ - "10.30.51.33"
+
+# Docker daemon settings.
+docker_daemon:
+ # https://docs.docker.com/engine/reference/commandline/dockerd/#daemon-configuration-file
+ dns: [ "172.17.0.1" ]
+ dns-opts: []
+ dns-search: [ "{{ansible_hostname}}" ] \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.32.8.15.yaml b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.32.8.15.yaml
index 7e103cf8c7..c9825c7830 100644
--- a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.32.8.15.yaml
+++ b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.32.8.15.yaml
@@ -38,9 +38,9 @@ nomad_options:
driver.whitelist: "docker,raw_exec,exec"
fingerprint.network.disallow_link_local: true
nomad_retry_servers: [ "10.30.51.30", "10.30.51.32", "10.30.51.33" ]
-nomad_servers: [ "10.30.51.32:4647", "10.30.51.33:4647" ]
+nomad_servers: [ "10.30.51.32:4647", "10.30.51.33:4647", "10.30.51.30:4647" ]
nomad_volumes:
- - name: "prod-volume-data2-1"
+ - name: "prod-volume-data1-1"
path: "/data"
read_only: false
@@ -60,4 +60,11 @@ consul_node_role: "client"
consul_retry_servers:
- "10.30.51.30"
- "10.30.51.32"
- - "10.30.51.33" \ No newline at end of file
+ - "10.30.51.33"
+
+# Docker daemon settings.
+docker_daemon:
+ # https://docs.docker.com/engine/reference/commandline/dockerd/#daemon-configuration-file
+ dns: [ "172.17.0.1" ]
+ dns-opts: []
+ dns-search: [ "{{ansible_hostname}}" ] \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.32.8.16.yaml b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.32.8.16.yaml
index e1b47cae15..1d952556f9 100644
--- a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.32.8.16.yaml
+++ b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.32.8.16.yaml
@@ -39,6 +39,10 @@ nomad_options:
fingerprint.network.disallow_link_local: true
nomad_retry_servers: [ "10.30.51.30", "10.30.51.32", "10.30.51.33" ]
nomad_servers: [ "10.30.51.32:4647", "10.30.51.33:4647", "10.30.51.30:4647" ]
+nomad_volumes:
+ - name: "prod-volume-data1-1"
+ path: "/data"
+ read_only: false
# Consul settings.
consul_nomad_integration: true
@@ -56,4 +60,11 @@ consul_node_role: "client"
consul_retry_servers:
- "10.30.51.30"
- "10.30.51.32"
- - "10.30.51.33" \ No newline at end of file
+ - "10.30.51.33"
+
+# Docker daemon settings.
+docker_daemon:
+ # https://docs.docker.com/engine/reference/commandline/dockerd/#daemon-configuration-file
+ dns: [ "172.17.0.1" ]
+ dns-opts: []
+ dns-search: [ "{{ansible_hostname}}" ] \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.32.8.17.yaml b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.32.8.17.yaml
index 6a4e238bdc..2feac858d9 100644
--- a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.32.8.17.yaml
+++ b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/host_vars/10.32.8.17.yaml
@@ -39,6 +39,10 @@ nomad_options:
fingerprint.network.disallow_link_local: true
nomad_retry_servers: [ "10.30.51.30", "10.30.51.32", "10.30.51.33" ]
nomad_servers: [ "10.30.51.32:4647", "10.30.51.33:4647", "10.30.51.30:4647" ]
+nomad_volumes:
+ - name: "prod-volume-data1-1"
+ path: "/data"
+ read_only: false
# Consul settings.
consul_nomad_integration: true
@@ -56,4 +60,11 @@ consul_node_role: "client"
consul_retry_servers:
- "10.30.51.30"
- "10.30.51.32"
- - "10.30.51.33" \ No newline at end of file
+ - "10.30.51.33"
+
+# Docker daemon settings.
+docker_daemon:
+ # https://docs.docker.com/engine/reference/commandline/dockerd/#daemon-configuration-file
+ dns: [ "172.17.0.1" ]
+ dns-opts: []
+ dns-search: [ "{{ansible_hostname}}" ] \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/hosts b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/hosts
index 741da675f1..cc0e0b3986 100644
--- a/resources/tools/testbed-setup/ansible/inventories/lf_inventory/hosts
+++ b/resources/tools/testbed-setup/ansible/inventories/lf_inventory/hosts
@@ -44,6 +44,7 @@ all:
10.32.8.24: #s60-t210-sut1 - epyc
10.30.51.69: #s27-t34-sut1 - thunderx2 9975
vpp_device:
+ # Note: vpp_device hosts are also Nomad client hosts.
hosts:
10.30.51.50: #s1-t11-sut1 - skylake
10.30.51.51: #s2-t12-sut1 - skylake
diff --git a/resources/tools/testbed-setup/ansible/nomad.yaml b/resources/tools/testbed-setup/ansible/nomad.yaml
index 653215651a..ae9de1e524 100644
--- a/resources/tools/testbed-setup/ansible/nomad.yaml
+++ b/resources/tools/testbed-setup/ansible/nomad.yaml
@@ -5,6 +5,12 @@
remote_user: testuser
become: yes
become_user: root
+ gather_facts: false
+ pre_tasks:
+ - name: Gathering Facts
+ gather_facts:
+ tags:
+ - always
roles:
- role: user_add
tags: user_add
@@ -15,4 +21,12 @@
- role: nomad
tags: nomad
- role: consul
- tags: consul \ No newline at end of file
+ tags: consul
+ - role: prometheus_exporter
+ tags: prometheus_exporter
+ - role: jenkins_job_health_exporter
+ tags: jenkins_job_health_exporter
+ - role: cadvisor
+ tags: cadvisor
+ - role: cleanup
+ tags: cleanup \ No newline at end of file
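Note: setting gather_facts: false and re-running fact collection as a pre_task tagged "always" keeps facts such as ansible_distribution available even when the play is limited to a tag subset, so the role conditionals still evaluate. A hypothetical invocation that relies on this (inventory path assumed):

    ansible-playbook -i inventories/lf_inventory/hosts nomad.yaml --tags cadvisor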
diff --git a/resources/tools/testbed-setup/ansible/roles/ab/defaults/main.yaml b/resources/tools/testbed-setup/ansible/roles/ab/defaults/main.yaml
index 0609b06855..45b80be42d 100644
--- a/resources/tools/testbed-setup/ansible/roles/ab/defaults/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/ab/defaults/main.yaml
@@ -1,14 +1,17 @@
---
# file: roles/ab/defaults/main.yaml
-packages: "{{ packages_base + packages_by_distro[ansible_distribution | lower] + packages_by_arch[ansible_machine] }}"
+packages: "{{ packages_base + packages_by_distro[ansible_distribution|lower][ansible_distribution_release] + packages_by_arch[ansible_machine] }}"
packages_base:
- []
packages_by_distro:
ubuntu:
- - "apache2-utils"
+ bionic:
+ - "apache2-utils"
+ focal:
+ - "apache2-utils"
packages_by_arch:
aarch64:
diff --git a/resources/tools/testbed-setup/ansible/roles/ab/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/ab/tasks/main.yaml
index 9153198cfb..37e702e6df 100644
--- a/resources/tools/testbed-setup/ansible/roles/ab/tasks/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/ab/tasks/main.yaml
@@ -1,9 +1,18 @@
---
# file: roles/ab/tasks/main.yaml
-- name: Install Apache ab tools
+- name: Inst - Update Package Cache (APT)
+ apt:
+ update_cache: yes
+ cache_valid_time: 3600
+ when:
+ - ansible_distribution|lower == 'ubuntu'
+ tags:
+ - ab-inst-prerequisites
+
+- name: Inst - Apache ab tools
package:
name: "{{ packages | flatten(levels=1) }}"
state: present
- update_cache: true
- tags: install-ab
+ tags:
+ - ab-inst \ No newline at end of file
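Note: the update-cache pre-task introduced here (and repeated verbatim in the roles below) replaces the old update_cache: true on the package task; with cache_valid_time: 3600 it amounts to roughly the following guard, refreshing APT metadata only when the cache is older than an hour (a sketch, not the module's exact mechanism):

    [ -z "$(find /var/lib/apt/lists -maxdepth 0 -mmin -60)" ] && apt-get update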
diff --git a/resources/tools/testbed-setup/ansible/roles/aws/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/aws/tasks/main.yaml
index 7b7a7fbb1a..deca0670ec 100644
--- a/resources/tools/testbed-setup/ansible/roles/aws/tasks/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/aws/tasks/main.yaml
@@ -1,26 +1,40 @@
---
# file: roles/aws/tasks/main.yaml
-- name: AWS - Edit repositories
- include_tasks: '{{ ansible_distribution|lower }}_{{ ansible_distribution_release }}.yaml'
- tags: edit-repo
+- name: Edit repositories
+ include_tasks: "{{ ansible_distribution|lower }}_{{ ansible_distribution_release }}.yaml"
+ tags:
+ - edit-repo
+
+- name: Get vfio-pci With WC Patcher
+ get_url:
+ url: "https://github.com/amzn/amzn-drivers/raw/master/userspace/dpdk/enav2-vfio-patch/get-vfio-with-wc.sh"
+ dest: "/opt/get-vfio-with-wc.sh"
+ mode: "744"
+ tags:
+ - vfio-aws-patch
+
+- name: Create vfio-pci Patch Directory
+ file:
+ path: "/opt/patches/"
+ state: "directory"
+ tags:
+ - vfio-aws-patch
-- name: AWS - Get vfio-pci Patcher Script
+- name: Get vfio-pci WC Patch
get_url:
- url: "https://github.com/amzn/amzn-drivers/raw/master/userspace/dpdk/enav2-vfio-patch/vfio-wc-patch.sh"
- dest: "/opt/vfio-wc-patch.sh"
+ url: "https://github.com/amzn/amzn-drivers/raw/master/userspace/dpdk/enav2-vfio-patch/patches/linux-4.10-vfio-wc.patch"
+ dest: "/opt/patches/linux-4.10-vfio-wc.patch"
mode: "744"
- register: "vfio_patch_downloaded"
tags:
- vfio-aws-patch
-- name: AWS - Patch vfio-pci
- shell: "/bin/bash /opt/vfio-wc-patch.sh"
- when: "vfio_patch_downloaded"
+- name: Compile vfio-pci With WC Patch
+ shell: "/bin/bash /opt/get-vfio-with-wc.sh"
tags:
- vfio-aws-patch
-- name: AWS - Load Kernel Modules By Default
+- name: Load Kernel Modules By Default
lineinfile:
path: "/etc/modules"
state: "present"
@@ -28,11 +42,10 @@
with_items:
- "vfio-pci"
- "igb_uio"
- register: "modules_added"
tags:
- load-kernel-modules
-- name: AWS - Add Kernel Modules Options
+- name: Add Kernel Modules Options (igb_uio)
lineinfile:
path: "/etc/modprobe.d/igb_uio.conf"
state: "present"
@@ -40,20 +53,28 @@
create: "yes"
with_items:
- "options igb_uio wc_activate=1"
- when: "modules_added"
- register: "modules_added"
tags:
- load-kernel-modules
-- name: AWS - Reload systemd-modules
+- name: Add Kernel Modules Options (vfio-pci)
+ lineinfile:
+ path: "/etc/modprobe.d/vfio-noiommu.conf"
+ state: "present"
+ line: "{{ item }}"
+ create: "yes"
+ with_items:
+ - "options vfio enable_unsafe_noiommu_mode=1"
+ tags:
+ - load-kernel-modules
+
+- name: Reload systemd-modules
systemd:
name: "systemd-modules-load"
state: "restarted"
- when: "modules_added"
tags:
- reload-systemd-modules
-- name: AWS - Performance Tuning - Adjust nr_hugepages
+- name: Performance Tuning - Adjust nr_hugepages
sysctl:
name: "vm.nr_hugepages"
value: "8192"
diff --git a/resources/tools/testbed-setup/ansible/roles/aws/tasks/ubuntu_bionic.yaml b/resources/tools/testbed-setup/ansible/roles/aws/tasks/ubuntu_bionic.yaml
index 75e4a3ae57..ba6e107e45 100644
--- a/resources/tools/testbed-setup/ansible/roles/aws/tasks/ubuntu_bionic.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/aws/tasks/ubuntu_bionic.yaml
@@ -1,10 +1,10 @@
---
# file: roles/aws/tasks/ubuntu_bionic.yaml
-- name: AWS - Enable deb-src APT Repositories
- replace:
- path: "/etc/apt/sources.list"
- regexp: "^# deb-src "
- replace: "deb-src "
+- name: Enable deb-src APT Repository
+ apt_repository:
+ repo: "deb-src http://archive.ubuntu.com/ubuntu bionic main"
+ state: "present"
+ update_cache: yes
tags:
- - enable-src-repo
+ - enable-src-repo \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/cadvisor/defaults/main.yaml b/resources/tools/testbed-setup/ansible/roles/cadvisor/defaults/main.yaml
new file mode 100644
index 0000000000..3b25e551ea
--- /dev/null
+++ b/resources/tools/testbed-setup/ansible/roles/cadvisor/defaults/main.yaml
@@ -0,0 +1,24 @@
+---
+# file: roles/cadvisor/defaults/main.yaml
+
+packages: "{{ packages_base + packages_by_distro[ansible_distribution | lower] + packages_by_arch[ansible_machine] }}"
+
+packages_base:
+ - []
+
+packages_by_distro:
+ ubuntu:
+ - "python3-docker"
+ - "python3-dockerpty"
+
+packages_by_arch:
+ aarch64:
+ - []
+ x86_64:
+ - []
+
+image: "{{ image_by_arch[ansible_machine] }}"
+
+image_by_arch:
+ aarch64: "zcube/cadvisor:v0.37.0"
+ x86_64: "gcr.io/cadvisor/cadvisor:v0.38.7" \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/cadvisor/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/cadvisor/tasks/main.yaml
new file mode 100644
index 0000000000..a2a13368c2
--- /dev/null
+++ b/resources/tools/testbed-setup/ansible/roles/cadvisor/tasks/main.yaml
@@ -0,0 +1,39 @@
+---
+# file: roles/cadvisor/tasks/main.yaml
+
+- name: Inst - Update Package Cache (APT)
+ apt:
+ update_cache: yes
+ cache_valid_time: 3600
+ when:
+ - ansible_distribution|lower == 'ubuntu'
+ tags:
+ - cadvisor-inst-prerequisites
+
+- name: Inst - Prerequisites
+ package:
+ name: "{{ packages | flatten(levels=1) }}"
+ state: latest
+ tags:
+ - cadvisor-inst-prerequisites
+
+- name: Inst - Start a container
+ docker_container:
+ name: "cAdvisor"
+ image: "{{ image }}"
+ state: "started"
+ restart_policy: "unless-stopped"
+ detach: yes
+ devices:
+ - "/dev/kmsg"
+ ports:
+ - "8080:8080"
+ privileged: yes
+ volumes:
+ - "/:/rootfs:ro"
+ - "/var/run:/var/run:ro"
+ - "/sys:/sys:ro"
+ - "/var/lib/docker/:/var/lib/docker:ro"
+ - "/dev/disk/:/dev/disk:ro"
+ tags:
+ - cadvisor-run-container
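Note: for reference, the docker_container task above maps onto roughly this CLI call (image resolved for x86_64 per the role defaults):

    docker run --name cAdvisor --detach --privileged \
        --restart unless-stopped --device /dev/kmsg --publish 8080:8080 \
        --volume /:/rootfs:ro --volume /var/run:/var/run:ro --volume /sys:/sys:ro \
        --volume /var/lib/docker/:/var/lib/docker:ro --volume /dev/disk/:/dev/disk:ro \
        gcr.io/cadvisor/cadvisor:v0.38.7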
diff --git a/resources/tools/testbed-setup/ansible/roles/calibration/defaults/main.yaml b/resources/tools/testbed-setup/ansible/roles/calibration/defaults/main.yaml
index c639f11088..5eb945328c 100644
--- a/resources/tools/testbed-setup/ansible/roles/calibration/defaults/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/calibration/defaults/main.yaml
@@ -2,15 +2,19 @@
# file: roles/calibration/defaults/main.yaml
# Packages to install.
-packages: "{{ packages_base + packages_by_distro[ansible_distribution | lower] + packages_by_arch[ansible_machine] }}"
+packages: "{{ packages_base + packages_by_distro[ansible_distribution|lower][ansible_distribution_release] + packages_by_arch[ansible_machine] }}"
packages_base:
- []
packages_by_distro:
ubuntu:
- - "build-essential"
- - "dmidecode"
+ bionic:
+ - "build-essential"
+ - "dmidecode"
+ focal:
+ - "build-essential"
+ - "dmidecode"
packages_by_arch:
aarch64:
@@ -19,16 +23,24 @@ packages_by_arch:
- []
# Kernel version to check.
-kernel_version: "{{ kernel_version_by_distro_by_arch[ansible_distribution | lower][ansible_machine] }}"
+kernel_version: "{{ kernel_version_by_distro_by_arch[ansible_distribution | lower][ansible_distribution_release][ansible_machine] }}"
kernel_version_by_distro_by_arch:
ubuntu:
- x86_64:
- - "4.15.0-72-generic"
- - "5.3.0-1020-azure"
- - "5.3.0-1017-aws"
- aarch64:
- - "4.15.0-54-generic"
+ bionic:
+ x86_64:
+ - "4.15.0-72-generic"
+ - "5.3.0-1020-azure"
+ - "4.15.0-1057-aws"
+ aarch64:
+ - "4.15.0-54-generic"
+ focal:
+ x86_64:
+ - "5.4.0-65-generic"
+ - "5.3.0-1020-azure"
+ - "5.3.0-1017-aws"
+ aarch64:
+ - "5.4.0-65-generic"
pma_directory: "/tmp/pma_tools"
jitter_core: 7
diff --git a/resources/tools/testbed-setup/ansible/roles/calibration/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/calibration/tasks/main.yaml
index a6e8898f9d..5d0c3b18ff 100644
--- a/resources/tools/testbed-setup/ansible/roles/calibration/tasks/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/calibration/tasks/main.yaml
@@ -1,13 +1,21 @@
---
# file: roles/calibration/tasks/main.yaml
-- name: Install Distribution - Release - Machine Prerequisites
+- name: Inst - Update Package Cache (APT)
+ apt:
+ update_cache: yes
+ cache_valid_time: 3600
+ when:
+ - ansible_distribution|lower == 'ubuntu'
+ tags:
+ - calibration-inst-prerequisites
+
+- name: Inst - Prerequisites
package:
name: "{{ packages | flatten(levels=1) }}"
state: latest
- update_cache: true
tags:
- - install-dependencies
+ - calibration-inst-prerequisites
- name: Check CPU Power States
shell: "lscpu"
diff --git a/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/clean_images.yaml b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/clean_images.yaml
new file mode 100644
index 0000000000..e030acbff2
--- /dev/null
+++ b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/clean_images.yaml
@@ -0,0 +1,36 @@
+---
+# file: roles/cleanup/tasks/clean_images.yaml
+
+- name: Clean Docker Images
+ block:
+ - name: Clean Images - Prefetch Docker Images
+ cron:
+ name: "Prefetch docker image {{ item }}"
+ minute: "10"
+ hour: "7"
+ job: "/usr/bin/docker pull {{ item }}"
+ loop:
+ "{{ images_to_prefetch_by_arch[ansible_machine] }}"
+ tags:
+ - prefetch-docker-images
+
+ - name: Clean Images - Remove Dangling Docker Images
+ cron:
+ name: "Remove dangling docker images"
+ minute: "10"
+ hour: "5"
+ weekday: "7"
+ job: "/usr/bin/docker rmi $(/usr/bin/docker images --filter 'dangling=true' -q)"
+ tags:
+ - remove-docker-images-dangling
+
+ # TODO: Disabled until all images are in the registry
+ #- name: Clean Images - Prune Docker Images
+ # cron:
+ # name: "Prune docker images"
+ # minute: "10"
+ # hour: "6"
+ # weekday: 7
+ # job: "/usr/bin/docker image prune --all --force"
+ # tags:
+ # - prune-docker-images \ No newline at end of file
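Note: each cron task above becomes a standard five-field entry in root's crontab, prefixed with the cron module's name marker; for one prefetched x86_64 image the generated lines would read:

    #Ansible: Prefetch docker image fdiotools/builder-ubuntu2004:prod-x86_64
    10 7 * * * /usr/bin/docker pull fdiotools/builder-ubuntu2004:prod-x86_64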
diff --git a/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/kill_containers.yaml b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/kill_containers.yaml
index ad4fb37681..25fd48e420 100644
--- a/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/kill_containers.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/kill_containers.yaml
@@ -3,16 +3,18 @@
- name: Kill Docker Containers
block:
- - name: Kill container - Get Running Docker Containers
+ - name: Kill Container - Get Running Docker Containers
shell: "docker ps -aq"
register: running_containers
changed_when: no
- tags: kill-containers
+ tags:
+ - kill-containers
- - name: Kill container - Remove All Docker Containers
+ - name: Kill Container - Remove All Docker Containers
shell: "docker rm --force {{ item }}"
with_items: "{{ running_containers.stdout_lines }}"
- tags: kill-containers
+ tags:
+ - kill-containers
rescue:
- name: Restart Docker Daemon
@@ -22,16 +24,18 @@
- name: Kill LXC Containers
block:
- - name: Kill container - Get Running LXC Containers
+ - name: Kill Container - Get Running LXC Containers
shell: "lxc-ls"
register: running_containers
changed_when: no
- tags: kill-containers
+ tags:
+ - kill-containers
- - name: Kill container - Remove All LXC Containers
+ - name: Kill Container - Remove All LXC Containers
shell: "lxc-destroy --force -n {{ item }}"
with_items: "{{ running_containers.stdout_lines }}"
- tags: kill-containers
+ tags:
+ - kill-containers
rescue:
- fail:
diff --git a/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/kill_process.yaml b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/kill_process.yaml
index 30a9f459b5..c7cee37485 100644
--- a/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/kill_process.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/kill_process.yaml
@@ -1,32 +1,36 @@
---
# file: roles/cleanup/tasks/kill_process.yaml
-- name: Kill process - {{ process }}
+- name: Kill Process - {{ process }}
block:
- - name: Kill process - Get pid of {{ process }}
+ - name: Get PID Of {{ process }}
shell: "ps -ef | grep -v grep | grep -w {{ process }} | awk '{print $2}'"
- when: >
- process is defined and process != ""
+ when:
+ - process is defined and process != ""
register: running_processes
- tags: kill-process
+ tags:
+ - kill-process
- - name: Kill process - Safe kill {{ process }}
+ - name: Safe Kill {{ process }}
shell: "kill {{ item }}"
with_items: "{{ running_processes.stdout_lines }}"
- tags: kill-process
+ tags:
+ - kill-process
- wait_for:
path: "/proc/{{ item }}/status"
- state: absent
+ state: "absent"
with_items: "{{ running_processes.stdout_lines }}"
ignore_errors: yes
register: killed_processes
- tags: kill-process
+ tags:
+ - kill-process
- - name: Kill process - Force kill {{ process }}
+ - name: Kill Process - Force Kill {{ process }}
shell: "kill -9 {{ item }}"
with_items: "{{ killed_processes.results | select('failed') | map(attribute='item') | list }}"
- tags: kill-process
+ tags:
+ - kill-process
rescue:
- fail:
diff --git a/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/main.yaml
index 64a55c4672..eeda0139b3 100644
--- a/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/main.yaml
@@ -14,18 +14,30 @@
# - vpp_device
# - Run tasks on vpp_device servers only.
# - Reset SRIOV
+# - Docker image cleanup
+# - nomad
+# - Docker image cleanup
- name: tg specific
include_tasks: tg.yaml
when: "'tg' in group_names"
- tags: cleanup
+ tags:
+ - cleanup
- name: sut specific
include_tasks: sut.yaml
when: "'sut' in group_names"
- tags: cleanup
+ tags:
+ - cleanup
- name: vpp_device specific
include_tasks: vpp_device.yaml
when: "'vpp_device' in group_names"
- tags: cleanup
+ tags:
+ - cleanup
+
+- name: nomad specific
+ include_tasks: nomad.yaml
+ when: "'nomad' in group_names"
+ tags:
+ - cleanup
diff --git a/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/nomad.yaml b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/nomad.yaml
new file mode 100644
index 0000000000..3c5bf6462d
--- /dev/null
+++ b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/nomad.yaml
@@ -0,0 +1,22 @@
+---
+# file: roles/cleanup/tasks/nomad.yaml
+
+- name: Host Cleanup
+ block:
+ - name: Clean Images
+ import_tasks: clean_images.yaml
+ vars:
+ images_to_prefetch_by_arch:
+ aarch64:
+ - "fdiotools/builder-ubuntu2004:prod-aarch64"
+ - "fdiotools/builder-ubuntu1804:prod-aarch64"
+ - "fdiotools/builder-centos8:prod-aarch64"
+ x86_64:
+ - "fdiotools/builder-ubuntu2004:prod-x86_64"
+ - "fdiotools/builder-ubuntu1804:prod-x86_64"
+ - "fdiotools/builder-debian10:prod-x86_64"
+ - "fdiotools/builder-debian9:prod-x86_64"
+ - "fdiotools/builder-centos8:prod-x86_64"
+ - "fdiotools/builder-centos7:prod-x86_64"
+ tags:
+ - clean-images \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/remove_package.yaml b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/remove_package.yaml
index 0c8816fe29..302b43c99a 100644
--- a/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/remove_package.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/remove_package.yaml
@@ -1,19 +1,21 @@
---
# file: roles/cleanup/tasks/remove_package.yaml
-- name: Remove package - Fix corrupted apt
- shell: 'dpkg --configure -a'
- when: >
- ansible_distribution == 'Ubuntu'
- tags: remove-package
+- name: Remove Package - Fix Corrupted APT
+ shell: "dpkg --configure -a"
+ when:
+ - ansible_distribution == 'Ubuntu'
+ tags:
+ - remove-package
-- name: Remove package - {{ package }}
+- name: Remove Package - {{ package }}
apt:
- name: '{{ package }}'
+ name: "{{ package }}"
force: yes
purge: yes
- state: absent
+ state: "absent"
failed_when: no
- when: >
- ansible_distribution == 'Ubuntu'
- tags: remove-package
+ when:
+ - ansible_distribution == 'Ubuntu'
+ tags:
+ - remove-package
diff --git a/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/sut.yaml b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/sut.yaml
index 53a65dd608..d80a35b1cb 100644
--- a/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/sut.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/sut.yaml
@@ -1,72 +1,83 @@
---
# file: roles/cleanup/tasks/sut.yaml
-- name: Host cleanup
+- name: Host Cleanup
block:
- - name: Kill processes - qemu
+ - name: Kill Processes - Qemu
import_tasks: kill_process.yaml
vars:
process: "qemu"
- tags: kill-process
+ tags:
+ - kill-process
- - name: Kill processes - l3fwd
+ - name: Kill Processes - L3fwd
import_tasks: kill_process.yaml
vars:
process: "l3fwd"
- tags: kill-process
+ tags:
+ - kill-process
- - name: Kill processes - testpmd
+ - name: Kill Processes - Testpmd
import_tasks: kill_process.yaml
vars:
process: "testpmd"
- tags: kill-process
+ tags:
+ - kill-process
- - name: Kill processes - iperf3
+ - name: Kill Processes - iPerf3
import_tasks: kill_process.yaml
vars:
process: "iperf3"
- tags: kill-process
+ tags:
+ - kill-process
- - name: Kill processes - vpp_echo
+ - name: Kill Processes - vpp_echo
import_tasks: kill_process.yaml
vars:
process: "vpp_echo"
- tags: kill-process
+ tags:
+ - kill-process
- - name: Find file or dir - Core zip file
+ - name: Find File Or Dir - Core Zip File
find:
paths: "/tmp/"
patterns: "*tar.lzo.lrz.xz*"
register: files_to_delete
- tags: remove-file-dir
+ tags:
+ - remove-file-dir
- - name: Remove file or dir - Core zip file
+ - name: Remove File Or Dir - Core Zip File
file:
path: "{{ item.path }}"
state: absent
with_items: "{{ files_to_delete.files }}"
- tags: remove-file-dir
+ tags:
+ - remove-file-dir
- - name: Find file or dir - Core dump file
+ - name: Find File Or Dir - Core Dump File
find:
paths: "/tmp/"
patterns: "*core*"
register: files_to_delete
- tags: remove-file-dir
+ tags:
+ - remove-file-dir
- - name: Remove file or dir - Core dump file
+ - name: Remove File Or Dir - Core Dump File
file:
path: "{{ item.path }}"
state: absent
with_items: "{{ files_to_delete.files }}"
- tags: remove-file-dir
+ tags:
+ - remove-file-dir
- - name: Kill containers - Remove all containers
+ - name: Kill Containers - Remove All Containers
import_tasks: kill_containers.yaml
- tags: kill-containers
+ tags:
+ - kill-containers
- - name: Remove packages - Remove VPP
+ - name: Remove Packages - Remove VPP
import_tasks: remove_package.yaml
vars:
package: "*vpp*"
- tags: remove-package
+ tags:
+ - remove-package
diff --git a/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/tg.yaml b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/tg.yaml
index 9ac83bc9fc..fa2d2d2819 100644
--- a/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/tg.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/tg.yaml
@@ -1,9 +1,13 @@
---
# file: roles/cleanup/tasks/tg.yaml
-- name: Kill processes - TRex
- import_tasks: kill_process.yaml
- vars:
- process: "_t-rex"
- when: docker_tg is undefined
- tags: kill-process
+- name: Host Cleanup
+ block:
+ - name: Kill Processes - TRex
+ import_tasks: kill_process.yaml
+ vars:
+ process: "_t-rex"
+ when:
+ - docker_tg is undefined
+ tags:
+ - kill-process
diff --git a/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/vpp_device.yaml b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/vpp_device.yaml
index f5e6ea5488..41c4b29d37 100644
--- a/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/vpp_device.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/vpp_device.yaml
@@ -1,15 +1,32 @@
---
# file: roles/cleanup/tasks/vpp_device.yaml
-- name: Reset vpp_device binary
- copy:
- src: 'files/reset_vppdevice.sh'
- dest: '/usr/local/bin'
- owner: 'root'
- group: 'root'
- mode: '744'
- tags: reset-sriov
+- name: Host Cleanup
+ block:
+ - name: Reset vpp_device Binary
+ copy:
+ src: "files/reset_vppdevice.sh"
+ dest: "/usr/local/bin"
+ owner: "root"
+ group: "root"
+ mode: "744"
+ tags:
+ - reset-sriov
-- name: Reset vpp_device
- raw: 'reset_vppdevice.sh --force'
- tags: reset-sriov
+ - name: Clean Images
+ import_tasks: clean_images.yaml
+ vars:
+ images_to_prefetch_by_arch:
+ aarch64:
+ - "fdiotools/builder-ubuntu2004:prod-aarch64"
+ - "fdiotools/builder-ubuntu1804:prod-aarch64"
+ - "fdiotools/builder-centos8:prod-aarch64"
+ x86_64:
+ - "fdiotools/builder-ubuntu2004:prod-x86_64"
+ - "fdiotools/builder-ubuntu1804:prod-x86_64"
+ - "fdiotools/builder-debian10:prod-x86_64"
+ - "fdiotools/builder-debian9:prod-x86_64"
+ - "fdiotools/builder-centos8:prod-x86_64"
+ - "fdiotools/builder-centos7:prod-x86_64"
+ tags:
+ - clean-images \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/common/defaults/main.yaml b/resources/tools/testbed-setup/ansible/roles/common/defaults/main.yaml
index 5517b20e1f..43e40ebdf6 100644
--- a/resources/tools/testbed-setup/ansible/roles/common/defaults/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/common/defaults/main.yaml
@@ -1,7 +1,7 @@
---
# file: roles/common/defaults/main.yaml
-packages: "{{ packages_base + packages_by_distro[ansible_distribution | lower] + packages_by_arch[ansible_machine] }}"
+packages: "{{ packages_base + packages_by_distro[ansible_distribution|lower][ansible_distribution_release] + packages_by_arch[ansible_machine] }}"
packages_base:
- "autoconf"
@@ -17,23 +17,37 @@ packages_base:
packages_by_distro:
ubuntu:
- - "build-essential"
- - "libpcap-dev"
- - "net-tools"
- - "python-all"
- - "python-apt"
- - "python-cffi"
- - "python-cffi-backend"
- - "python-dev"
- - "python-pip"
- - "python-setuptools"
- - "python3-all"
- - "python3-apt"
- - "python3-cffi"
- - "python3-cffi-backend"
- - "python3-dev"
- - "python3-pip"
- - "python3-setuptools"
+ bionic:
+ - "build-essential"
+ - "libpcap-dev"
+ - "net-tools"
+ - "python-all"
+ - "python-apt"
+ - "python-cffi"
+ - "python-cffi-backend"
+ - "python-dev"
+ - "python-pip"
+ - "python-setuptools"
+ - "python3-all"
+ - "python3-apt"
+ - "python3-cffi"
+ - "python3-cffi-backend"
+ - "python3-dev"
+ - "python3-pip"
+ - "python3-pyelftools"
+ - "python3-setuptools"
+ focal:
+ - "build-essential"
+ - "libpcap-dev"
+ - "net-tools"
+ - "python3-all"
+ - "python3-apt"
+ - "python3-cffi"
+ - "python3-cffi-backend"
+ - "python3-dev"
+ - "python3-pip"
+ - "python3-pyelftools"
+ - "python3-setuptools"
packages_by_arch:
aarch64:
@@ -55,5 +69,4 @@ packages_by_arch:
# ftp_proxy: http://proxy.com:80
# FTP_PROXY: http://proxy.com:80
# no_proxy: localhost,127.0.0.1,{{ ansible_default_ipv4.address }}
-# NO_PROXY: localhost,127.0.0.1,{{ ansible_default_ipv4.address }}
-
+# NO_PROXY: localhost,127.0.0.1,{{ ansible_default_ipv4.address }} \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/common/handlers/main.yaml b/resources/tools/testbed-setup/ansible/roles/common/handlers/main.yaml
index e7327d3944..bb317e8067 100644
--- a/resources/tools/testbed-setup/ansible/roles/common/handlers/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/common/handlers/main.yaml
@@ -1,7 +1,7 @@
---
# file: roles/common/handlers/main.yaml
-- name: Reboot server
+- name: Reboot Server
reboot:
reboot_timeout: 3600
tags:
diff --git a/resources/tools/testbed-setup/ansible/roles/common/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/common/tasks/main.yaml
index e60b32ae2a..91fcd188a6 100644
--- a/resources/tools/testbed-setup/ansible/roles/common/tasks/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/common/tasks/main.yaml
@@ -1,7 +1,7 @@
---
# file: roles/common/tasks/main.yaml
-- name: Add permanent proxy settings
+- name: Conf - Add permanent proxy settings
lineinfile:
path: "/etc/environment"
state: "present"
@@ -9,17 +9,25 @@
with_dict: "{{ proxy_env }}"
when: proxy_env is defined
tags:
- - set-proxy
+ - common-conf-proxy
-- name: Install Distribution - Release - Machine Prerequisites
+- name: Inst - Update package cache (apt)
+ apt:
+ update_cache: yes
+ cache_valid_time: 3600
+ when:
+ - ansible_distribution|lower == 'ubuntu'
+ tags:
+ - common-inst-prerequisites
+
+- name: Inst - Prerequisites
package:
name: "{{ packages | flatten(levels=1) }}"
state: latest
- update_cache: true
tags:
- - install-dependencies
+ - common-inst-prerequisites
-- name: Install CSIT PIP requirements
+- name: Inst - CSIT PIP requirements
pip:
name:
- "ecdsa==0.13.3"
@@ -31,7 +39,7 @@
- "robotframework==3.1.2"
- "scapy==2.4.3"
- "scp==0.13.2"
- - "ansible==2.7.8"
+ - "ansible==2.10.7"
- "dill==0.2.8.2"
- "numpy==1.17.3"
- "hdrhistogram==0.6.1"
@@ -73,24 +81,24 @@
- "sphinxcontrib-serializinghtml==1.1.3"
- "urllib3==1.25.6"
tags:
- - install-pip
+ - common-inst-pip
-- name: Install CSIT PIP requirements - Pandas and SciPy workaround
+- name: Inst - CSIT PIP requirements - Pandas and SciPy workaround
pip:
name:
- "pandas==0.25.3"
- - "scipy==1.1.0"
+ - "scipy==1.5.4"
tags:
- - install-pip
+ - common-inst-pip
-- name: Install Meson (repository version is too old)
+- name: Inst - Meson (DPDK)
pip:
name:
- "meson==0.47.1"
tags:
- - install-meson
+ - common-inst-meson
-- name: Set sudoers admin
+- name: Conf - sudoers admin
lineinfile:
path: "/etc/sudoers"
state: "present"
@@ -98,9 +106,9 @@
line: "%admin ALL=(ALL) ALL"
validate: "/usr/sbin/visudo -cf %s"
tags:
- - set-sudoers
+ - common-conf-sudoers
-- name: Set sudoers sudo
+- name: Conf - sudoers nopasswd
lineinfile:
path: "/etc/sudoers"
state: "present"
@@ -108,6 +116,6 @@
line: "%sudo ALL=(ALL:ALL) NOPASSWD: ALL"
validate: "/usr/sbin/visudo -cf %s"
tags:
- - set-sudoers
+ - common-conf-sudoers
- meta: flush_handlers
diff --git a/resources/tools/testbed-setup/ansible/roles/consul/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/consul/tasks/main.yaml
index 9d1ca1980d..99ac52da44 100644
--- a/resources/tools/testbed-setup/ansible/roles/consul/tasks/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/consul/tasks/main.yaml
@@ -1,11 +1,19 @@
---
# file: roles/consul/tasks/main.yaml
+- name: Inst - Update Package Cache (APT)
+ apt:
+ update_cache: yes
+ cache_valid_time: 3600
+ when:
+ - ansible_distribution|lower == 'ubuntu'
+ tags:
+ - consul-inst-prerequisites
+
- name: Inst - Prerequisites
package:
name: "{{ packages | flatten(levels=1) }}"
state: latest
- update_cache: true
tags:
- consul-inst-prerequisites
@@ -125,6 +133,16 @@
tags:
- consul-conf
+- name: Conf - Telemetry Configuration
+ template:
+ src: telemetry.hcl.j2
+ dest: "{{ consul_config_dir }}/telemetry.hcl"
+ owner: "{{ consul_user }}"
+ group: "{{ consul_group }}"
+ mode: 0644
+ tags:
+ - consul-conf
+
- name: Conf - Services Configuration
template:
src: services.json.j2
@@ -156,8 +174,8 @@
owner: "root"
group: "root"
mode: 0644
- notify:
- - "Restart Consul"
+# notify:
+# - "Restart Consul"
# - "Stop Systemd-resolved"
# - "Restart Nomad"
tags:
diff --git a/resources/tools/testbed-setup/ansible/roles/consul/templates/base.hcl.j2 b/resources/tools/testbed-setup/ansible/roles/consul/templates/base.hcl.j2
index e220c8f687..536c48d847 100644
--- a/resources/tools/testbed-setup/ansible/roles/consul/templates/base.hcl.j2
+++ b/resources/tools/testbed-setup/ansible/roles/consul/templates/base.hcl.j2
@@ -25,11 +25,11 @@ auto_encrypt {
}
{% else %}
verify_incoming = false
-verify_outgoing = true
-verify_server_hostname = true
+verify_outgoing = false
+verify_server_hostname = false
ca_file = "{{ consul_ca_file }}"
auto_encrypt {
- tls = true
+ tls = false
}
{% endif %}
{% if consul_retry_join | bool -%}
diff --git a/resources/tools/testbed-setup/ansible/roles/consul/templates/telemetry.hcl.j2 b/resources/tools/testbed-setup/ansible/roles/consul/templates/telemetry.hcl.j2
new file mode 100644
index 0000000000..ec7fabc9da
--- /dev/null
+++ b/resources/tools/testbed-setup/ansible/roles/consul/templates/telemetry.hcl.j2
@@ -0,0 +1,3 @@
+telemetry {
+ prometheus_retention_time = "24h"
+} \ No newline at end of file
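Note: this enables Prometheus-format telemetry on the Consul agent with a 24-hour retention window; once the agent picks up the file, metrics can be scraped from the standard agent API, e.g.:

    curl 'http://127.0.0.1:8500/v1/agent/metrics?format=prometheus'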
diff --git a/resources/tools/testbed-setup/ansible/roles/csit_shim_image/files/Dockerfile b/resources/tools/testbed-setup/ansible/roles/csit_shim_image/files/Dockerfile
deleted file mode 100644
index 2b2e1eae55..0000000000
--- a/resources/tools/testbed-setup/ansible/roles/csit_shim_image/files/Dockerfile
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright (c) 2020 Cisco and/or its affiliates.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at:
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-FROM ubuntu:18.04
-LABEL Description="CSIT vpp-device ubuntu 18.04 shim image"
-LABEL Version="master"
-
-# Setup the environment
-ENV DEBIAN_FRONTEND=noninteractive
-ENV NOTVISIBLE "in users profile"
-RUN echo "export VISIBLE=now" >> /etc/profile
-
-ADD files/wrapdocker /usr/local/bin/wrapdocker
-RUN chmod +x /usr/local/bin/wrapdocker
-
-# Install packages and Docker
-RUN apt-get -q update \
- && apt-get install -y -qq \
- bash \
- curl \
- iproute2 \
- locales \
- ssh \
- sudo \
- tzdata \
- uuid-runtime \
- && curl -fsSL https://get.docker.com | sh \
- && rm -rf /var/lib/apt/lists/*
-
-# Configure locales
-RUN locale-gen en_US
-
-RUN mkdir /var/run/sshd
-RUN echo 'root:Csit1234' | chpasswd
-RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
-
-# SSH login fix. Otherwise user is kicked off after login
-RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
-
-# Need volume for sidecar docker launches
-VOLUME /var/lib/docker
-
-# SSH to listen on port 6022 in shim
-RUN echo 'Port 6022' >>/etc/ssh/sshd_config
-RUN echo 'Port 6023' >>/etc/ssh/sshd_config
-ADD files/badkeypub /root/.ssh/authorized_keys
-ADD files/sshconfig /root/.ssh/config
-
-# Start sshd by default
-EXPOSE 22
-CMD ["/usr/sbin/sshd", "-D"] \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/csit_shim_image/files/files/badkeypub b/resources/tools/testbed-setup/ansible/roles/csit_shim_image/files/files/badkeypub
deleted file mode 100644
index 4530b66b05..0000000000
--- a/resources/tools/testbed-setup/ansible/roles/csit_shim_image/files/files/badkeypub
+++ /dev/null
@@ -1 +0,0 @@
-ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCyUNd/iRk5Ajw4ZBB0gXyjzecEzQHh/MctgvHGJjasqJDkwYyZBrunUorOZ3n82W8VGdd5+eNINCWOM/ERjuaHjnutfade+ocPgZRdk+kEgTvetDVNWIgBd0PMVcnp57jJfx7CZVqTNgGeVQ8OJ2RbJGeOb/EKApQI74IPkAfc0PSieSw5gC0eqEOHb39Awgp0ycrzsUHF/OEicfCmo+6vvrMGenDe7frKUoTKYMWs7l3DOyFC8NaOxhGD3J1Ne5u3A/r4w6mN1HVI0rFwIcoms+t0B4lb2ODWKZiZikQdn8/eqwsmbSEZZsWN3FkshgjPS83+dNqVwB6pPY5Yqte7 ejk@bhima.local \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/csit_shim_image/files/files/sshconfig b/resources/tools/testbed-setup/ansible/roles/csit_shim_image/files/files/sshconfig
deleted file mode 100644
index e7bd90757e..0000000000
--- a/resources/tools/testbed-setup/ansible/roles/csit_shim_image/files/files/sshconfig
+++ /dev/null
@@ -1,3 +0,0 @@
-Host 172.17.0.*
- StrictHostKeyChecking no
- UserKnownHostsFile=/dev/null \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/csit_shim_image/files/files/wrapdocker b/resources/tools/testbed-setup/ansible/roles/csit_shim_image/files/files/wrapdocker
deleted file mode 100644
index d13f8b7c5e..0000000000
--- a/resources/tools/testbed-setup/ansible/roles/csit_shim_image/files/files/wrapdocker
+++ /dev/null
@@ -1,113 +0,0 @@
-#!/bin/bash
-
-# Ensure that all nodes in /dev/mapper correspond to mapped devices currently loaded by the device-mapper kernel driver
-dmsetup mknodes
-
-# First, make sure that cgroups are mounted correctly.
-CGROUP=/sys/fs/cgroup
-: ${LOG:=stdio}
-
-[ -d $CGROUP ] ||
- mkdir $CGROUP
-
-mountpoint -q $CGROUP ||
- mount -n -t tmpfs -o uid=0,gid=0,mode=0755 cgroup $CGROUP || {
- echo "Could not make a tmpfs mount. Did you use --privileged?"
- exit 1
- }
-
-if [ -d /sys/kernel/security ] && ! mountpoint -q /sys/kernel/security
-then
- mount -t securityfs none /sys/kernel/security || {
- echo "Could not mount /sys/kernel/security."
- echo "AppArmor detection and --privileged mode might break."
- }
-fi
-
-# Mount the cgroup hierarchies exactly as they are in the parent system.
-for SUBSYS in $(cut -d: -f2 /proc/1/cgroup)
-do
- [ -d $CGROUP/$SUBSYS ] || mkdir $CGROUP/$SUBSYS
- mountpoint -q $CGROUP/$SUBSYS ||
- mount -n -t cgroup -o $SUBSYS cgroup $CGROUP/$SUBSYS
-
- # The two following sections address a bug which manifests itself
- # by a cryptic "lxc-start: no ns_cgroup option specified" when
-# trying to start containers within a container.
- # The bug seems to appear when the cgroup hierarchies are not
- # mounted on the exact same directories in the host, and in the
- # container.
-
- # Named, control-less cgroups are mounted with "-o name=foo"
- # (and appear as such under /proc/<pid>/cgroup) but are usually
- # mounted on a directory named "foo" (without the "name=" prefix).
- # Systemd and OpenRC (and possibly others) both create such a
- # cgroup. To avoid the aforementioned bug, we symlink "foo" to
- # "name=foo". This shouldn't have any adverse effect.
- echo $SUBSYS | grep -q ^name= && {
- NAME=$(echo $SUBSYS | sed s/^name=//)
- ln -s $SUBSYS $CGROUP/$NAME
- }
-
- # Likewise, on at least one system, it has been reported that
- # systemd would mount the CPU and CPU accounting controllers
- # (respectively "cpu" and "cpuacct") with "-o cpuacct,cpu"
- # but on a directory called "cpu,cpuacct" (note the inversion
- # in the order of the groups). This tries to work around it.
- [ $SUBSYS = cpuacct,cpu ] && ln -s $SUBSYS $CGROUP/cpu,cpuacct
-done
-
-# Note: as I write those lines, the LXC userland tools cannot setup
-# a "sub-container" properly if the "devices" cgroup is not in its
-# own hierarchy. Let's detect this and issue a warning.
-grep -q :devices: /proc/1/cgroup ||
- echo "WARNING: the 'devices' cgroup should be in its own hierarchy."
-grep -qw devices /proc/1/cgroup ||
- echo "WARNING: it looks like the 'devices' cgroup is not mounted."
-
-# Now, close extraneous file descriptors.
-pushd /proc/self/fd >/dev/null
-for FD in *
-do
- case "$FD" in
- # Keep stdin/stdout/stderr
- [012])
- ;;
- # Nuke everything else
- *)
- eval exec "$FD>&-"
- ;;
- esac
-done
-popd >/dev/null
-
-
-# If a pidfile is still around (for example after a container restart),
-# delete it so that docker can start.
-rm -rf /var/run/docker.pid
-
-# If we were given a PORT environment variable, start as a simple daemon;
-# otherwise, spawn a shell as well
-if [ "$PORT" ]
-then
- exec dockerd -H 0.0.0.0:$PORT -H unix:///var/run/docker.sock \
- $DOCKER_DAEMON_ARGS
-else
- if [ "$LOG" == "file" ]
- then
- dockerd $DOCKER_DAEMON_ARGS &>/var/log/docker.log &
- else
- dockerd $DOCKER_DAEMON_ARGS &
- fi
- (( timeout = 60 + SECONDS ))
- until docker info >/dev/null 2>&1
- do
- if (( SECONDS >= timeout )); then
- echo 'Timed out trying to connect to internal docker host.' >&2
- break
- fi
- sleep 1
- done
- [[ $1 ]] && exec "$@"
- exec bash --login
-fi \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/csit_shim_image/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/csit_shim_image/tasks/main.yaml
deleted file mode 100644
index bdba4f6563..0000000000
--- a/resources/tools/testbed-setup/ansible/roles/csit_shim_image/tasks/main.yaml
+++ /dev/null
@@ -1,32 +0,0 @@
----
-# file: roles/csit_shim_image/tasks/main.yaml
-
-- name: Create a directory if it does not exist
- file:
- path: "{{ item }}"
- state: "directory"
- mode: 0755
- with_items:
- - "/opt/csit-shim/"
- - "/opt/csit-shim/files"
- tags: csit-shim-image
-
-- name: Copy Build Items
- copy:
- src: "{{ item }}"
- dest: "/opt/csit-shim/{{ item }}"
- owner: "root"
- group: "root"
- mode: 0655
- with_items:
- - "Dockerfile"
- - "files/badkeypub"
- - "files/sshconfig"
- - "files/wrapdocker"
- tags: csit-shim-image
-
-- name: Build CSIT shim Docker Image
- shell: "docker build -t csit_shim-ubuntu1804:local ."
- args:
- chdir: "/opt/csit-shim"
- tags: csit-shim-image \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/csit_sut_image/files/Dockerfile b/resources/tools/testbed-setup/ansible/roles/csit_sut_image/files/Dockerfile
index 6dddad6ebb..73ff5c5e86 100644
--- a/resources/tools/testbed-setup/ansible/roles/csit_sut_image/files/Dockerfile
+++ b/resources/tools/testbed-setup/ansible/roles/csit_sut_image/files/Dockerfile
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 Cisco and/or its affiliates.
+# Copyright (c) 2021 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -11,55 +11,54 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-FROM ubuntu:18.04
-LABEL Description="CSIT vpp-device ubuntu 18.04 SUT image"
+FROM ubuntu:20.04
+LABEL Description="CSIT vpp-device ubuntu 20.04 SUT image"
LABEL Version="master"
# Setup the environment
ENV DEBIAN_FRONTEND=noninteractive
-ENV LANG='en_US.UTF-8' LANGUAGE='en_US:en' LC_ALL='en_US.UTF-8'
-ENV NOTVISIBLE "in users profile"
-ENV VPP_PYTHON_PREFIX=/var/cache/vpp/python
+
+# Configure locales
+RUN apt-get update -qq \
+ && apt-get install -y \
+ apt-utils \
+ locales \
+ && sed -i 's/# \(en_US\.UTF-8 .*\)/\1/' /etc/locale.gen \
+ && locale-gen en_US.UTF-8 \
+ && dpkg-reconfigure --frontend=noninteractive locales \
+ && update-locale LANG=en_US.UTF-8 \
+ && TZ=Etc/UTC && ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone \
+ && rm -r /var/lib/apt/lists/*
+ENV LANG="en_US.UTF-8" LANGUAGE="en_US" LC_ALL="en_US.UTF-8"
# Install packages and Docker
RUN apt-get -q update \
&& apt-get install -y -qq \
- # general tools
apt-transport-https \
bridge-utils \
+ ca-certificates \
cloud-init \
- curl \
- gdb \
- locales \
- net-tools \
- openssh-server \
- pciutils \
- rsyslog \
- software-properties-common \
- ssh \
- sudo \
- supervisor \
- tar \
- vim \
- wget \
- # csit requirements
cmake \
+ curl \
dkms \
+ gdb \
gfortran \
+ libapr1 \
libblas-dev \
libffi-dev \
liblapack-dev \
+ libmbedcrypto3 \
+ libmbedtls12 \
+ libmbedx509-0 \
+ libnuma1 \
+ libnuma-dev \
libpcap-dev \
+ libpixman-1-dev \
libssl-dev \
- python-all \
- python-apt \
- python-cffi \
- python-cffi-backend \
- python-dev \
- python-enum34 \
- python-pip \
- python-setuptools \
- python-virtualenv \
+ locales \
+ net-tools \
+ openssh-server \
+ pciutils \
python3-all \
python3-apt \
python3-cffi \
@@ -69,30 +68,23 @@ RUN apt-get -q update \
python3-setuptools \
python3-virtualenv \
qemu-system \
+ rsyslog \
socat \
+ software-properties-common \
strongswan \
- unzip \
+ ssh \
+ sshpass \
+ sudo \
+ supervisor \
+ tar \
tcpdump \
+ unzip \
+ vim \
+ wget \
zlib1g-dev \
- # vpp requirements
- ca-certificates \
- libapr1 \
- libmbedcrypto1 \
- libmbedtls10 \
- libmbedx509-0 \
- libnuma1 \
- sshpass \
- && curl -L https://packagecloud.io/fdio/master/gpgkey | sudo apt-key add - \
- && curl -s https://packagecloud.io/install/repositories/fdio/master/script.deb.sh | sudo bash \
- # temp hack due to build.sh
- && apt-get install -y -qq vpp-ext-deps \
&& curl -fsSL https://get.docker.com | sh \
&& rm -rf /var/lib/apt/lists/*
-# Configure locales
-RUN locale-gen en_US.UTF-8 \
- && dpkg-reconfigure locales
-
# Fix permissions
RUN chown root:syslog /var/log \
&& chmod 755 /etc/default
@@ -113,7 +105,7 @@ RUN pip3 install \
robotframework==3.1.2 \
scapy==2.4.3 \
scp==0.13.2 \
- ansible==2.7.8 \
+ ansible==2.10.7 \
dill==0.2.8.2 \
numpy==1.17.3 \
hdrhistogram==0.6.1 \
@@ -123,6 +115,7 @@ RUN pip3 install \
sphinx-rtd-theme==0.4.0 \
sphinxcontrib-programoutput==0.15 \
sphinxcontrib-robotdoc==0.11.0 \
+ ply==3.11 \
alabaster==0.7.12 \
Babel==2.7.0 \
bcrypt==3.1.7 \
@@ -158,7 +151,7 @@ RUN pip3 install \
# ARM workaround
RUN pip3 install \
pandas==0.25.3 \
- scipy==1.1.0
+ scipy==1.5.4
# SSH settings
RUN echo 'root:Csit1234' | chpasswd \
diff --git a/resources/tools/testbed-setup/ansible/roles/csit_sut_image/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/csit_sut_image/tasks/main.yaml
index 7b41be8a64..2affe4b18e 100644
--- a/resources/tools/testbed-setup/ansible/roles/csit_sut_image/tasks/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/csit_sut_image/tasks/main.yaml
@@ -6,7 +6,8 @@
path: "/opt/csit-sut/"
state: "directory"
mode: 0755
- tags: csit-sut-image
+ tags:
+ - csit-sut-image
- name: Copy Build Items
copy:
@@ -18,10 +19,12 @@
with_items:
- Dockerfile
- supervisord.conf
- tags: csit-sut-image
+ tags:
+ - csit-sut-image
- name: Build CSIT SUT Docker Image
- shell: "docker build -t csit_sut-ubuntu1804:local ."
+ shell: "docker build -t csit_sut-ubuntu2004:local ."
args:
chdir: "/opt/csit-sut"
- tags: csit-sut-image \ No newline at end of file
+ tags:
+ - csit-sut-image \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/docker/defaults/main.yaml b/resources/tools/testbed-setup/ansible/roles/docker/defaults/main.yaml
index 9b182de4c4..8343558238 100644
--- a/resources/tools/testbed-setup/ansible/roles/docker/defaults/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/docker/defaults/main.yaml
@@ -21,8 +21,8 @@ docker_apt_gpg_key_state: present
# Used only for RedHat/CentOS/Fedora.
docker_yum_repo_url: https://download.docker.com/linux/{{ (ansible_distribution == "Fedora") | ternary("fedora","centos") }}/docker-{{ docker_edition }}.repo
-docker_yum_repo_enable_edge: '0'
-docker_yum_repo_enable_test: '0'
+docker_yum_repo_enable_edge: "0"
+docker_yum_repo_enable_test: "0"
docker_yum_gpg_key: https://download.docker.com/linux/centos/gpg
# A list of users who will be added to the docker group.
diff --git a/resources/tools/testbed-setup/ansible/roles/docker/handlers/main.yaml b/resources/tools/testbed-setup/ansible/roles/docker/handlers/main.yaml
index 4dfcd0b867..d89adb9a1a 100644
--- a/resources/tools/testbed-setup/ansible/roles/docker/handlers/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/docker/handlers/main.yaml
@@ -5,3 +5,5 @@
service:
name: "docker"
state: "{{ docker_restart_handler_state }}"
+ tags:
+ - docker-restart-service \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/docker/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/docker/tasks/main.yaml
index 8158af51b4..5a96b7a7c5 100644
--- a/resources/tools/testbed-setup/ansible/roles/docker/tasks/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/docker/tasks/main.yaml
@@ -29,6 +29,18 @@
tags:
- docker-conf-service
+- name: Conf - Docker Daemon
+ template:
+ src: "templates/daemon.json.j2"
+ dest: "/etc/docker/daemon.json"
+ owner: "root"
+ group: "root"
+ mode: "0644"
+ when: >
+ docker_daemon is defined
+ tags:
+ - docker-conf-daemon
+
- name: Conf - Docker HTTP Proxy
template:
src: "templates/docker.service.proxy.http"
diff --git a/resources/tools/testbed-setup/ansible/roles/docker/tasks/ubuntu_bionic.yaml b/resources/tools/testbed-setup/ansible/roles/docker/tasks/ubuntu_bionic.yaml
index 2e82c552be..8bda4fed21 100644
--- a/resources/tools/testbed-setup/ansible/roles/docker/tasks/ubuntu_bionic.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/docker/tasks/ubuntu_bionic.yaml
@@ -11,17 +11,20 @@
state: "present"
cache_valid_time: 3600
install_recommends: False
- tags: docker-inst-dependencies
+ tags:
+ - docker-inst-dependencies
- name: Conf - Add APT Key
apt_key:
url: "{{ docker_apt_gpg_key }}"
state: "{{ docker_apt_gpg_key_state }}"
- tags: docker-conf-apt
+ tags:
+ - docker-conf-apt
- name: Conf - Install APT Repository
apt_repository:
repo: "{{ docker_apt_repository }}"
state: "{{ docker_apt_repository_state }}"
- update_cache: True
- tags: docker-conf-apt
+ update_cache: yes
+ tags:
+ - docker-conf-apt
diff --git a/resources/tools/testbed-setup/ansible/roles/docker/tasks/ubuntu_focal.yaml b/resources/tools/testbed-setup/ansible/roles/docker/tasks/ubuntu_focal.yaml
new file mode 100644
index 0000000000..84bd1c5824
--- /dev/null
+++ b/resources/tools/testbed-setup/ansible/roles/docker/tasks/ubuntu_focal.yaml
@@ -0,0 +1,30 @@
+---
+# file: roles/docker/tasks/ubuntu_focal.yaml
+
+- name: Inst - Dependencies
+ apt:
+ name:
+ - "apt-transport-https"
+ - "ca-certificates"
+ - "gpg-agent"
+ - "software-properties-common"
+ state: "present"
+ cache_valid_time: 3600
+ install_recommends: False
+ tags:
+ - docker-inst-dependencies
+
+- name: Conf - Add APT Key
+ apt_key:
+ url: "{{ docker_apt_gpg_key }}"
+ state: "{{ docker_apt_gpg_key_state }}"
+ tags:
+ - docker-conf-apt
+
+- name: Conf - Install APT Repository
+ apt_repository:
+ repo: "{{ docker_apt_repository }}"
+ state: "{{ docker_apt_repository_state }}"
+ update_cache: yes
+ tags:
+ - docker-conf-apt
diff --git a/resources/tools/testbed-setup/ansible/roles/docker/templates/daemon.json.j2 b/resources/tools/testbed-setup/ansible/roles/docker/templates/daemon.json.j2
new file mode 100644
index 0000000000..becc2b1af7
--- /dev/null
+++ b/resources/tools/testbed-setup/ansible/roles/docker/templates/daemon.json.j2
@@ -0,0 +1 @@
+{{ docker_daemon | to_nice_json }}
\ No newline at end of file
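
Editor's note: the new daemon.json.j2 template simply serializes whatever `docker_daemon` mapping a host defines, and the "Conf - Docker Daemon" task above only fires when that variable is defined. A minimal sketch, assuming a hypothetical host_vars entry (the key names below are illustrative, not part of this change):

    # host_vars (hypothetical):
    docker_daemon:
      dns: ["172.17.0.1"]
      max-concurrent-downloads: 10

    # rendered /etc/docker/daemon.json (to_nice_json, 4-space indent):
    {
        "dns": [
            "172.17.0.1"
        ],
        "max-concurrent-downloads": 10
    }
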
diff --git a/resources/tools/testbed-setup/ansible/roles/dpdk/defaults/main.yaml b/resources/tools/testbed-setup/ansible/roles/dpdk/defaults/main.yaml
index e43c28403b..2a8c691728 100644
--- a/resources/tools/testbed-setup/ansible/roles/dpdk/defaults/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/dpdk/defaults/main.yaml
@@ -1,15 +1,19 @@
---
# file: roles/dpdk/defaults/main.yaml
-packages: "{{ packages_base + packages_by_distro[ansible_distribution | lower] + packages_by_arch[ansible_machine] }}"
+packages: "{{ packages_base + packages_by_distro[ansible_distribution|lower][ansible_distribution_release] + packages_by_arch[ansible_machine] }}"
packages_base:
- []
packages_by_distro:
ubuntu:
- - "build-essential"
- - "libnuma-dev"
+ bionic:
+ - "build-essential"
+ - "libnuma-dev"
+ focal:
+ - "build-essential"
+ - "libnuma-dev"
packages_by_arch:
aarch64:
@@ -19,13 +23,9 @@ packages_by_arch:
dpdk_target_dir: "/opt"
dpdk_version:
- - "19.02"
- "20.02"
dpdk_url: "https://fast.dpdk.org/rel"
dpdk_build_targets:
- "19.02":
- aarch64: "arm64-armv8a-linuxapp-gcc"
- x86_64: "x86_64-native-linuxapp-gcc"
"20.02":
aarch64: "arm64-armv8a-linux-gcc"
x86_64: "x86_64-native-linux-gcc"
diff --git a/resources/tools/testbed-setup/ansible/roles/dpdk/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/dpdk/tasks/main.yaml
index 1b4e398e26..46f942be93 100644
--- a/resources/tools/testbed-setup/ansible/roles/dpdk/tasks/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/dpdk/tasks/main.yaml
@@ -1,15 +1,23 @@
---
# file: roles/dpdk/tasks/main.yaml
-- name: DPDK - Install Distribution - Release - Machine Prerequisites
+- name: Inst - Update Package Cache (APT)
+ apt:
+ update_cache: yes
+ cache_valid_time: 3600
+ when:
+ - ansible_distribution|lower == 'ubuntu'
+ tags:
+ - dpdk-inst-prerequisites
+
+- name: Inst - Prerequisites
package:
name: "{{ packages | flatten(levels=1) }}"
state: latest
- update_cache: true
tags:
- - install-dependencies
+ - dpdk-inst-prerequisites
-- name: DPDK - Download Release Archive
+- name: Download Release Archive
get_url:
url: "{{ dpdk_url }}/dpdk-{{ item }}.tar.xz"
dest: "{{ dpdk_target_dir }}/dpdk-{{ item }}.tar.xz"
@@ -17,9 +25,9 @@
loop: "{{ dpdk_version }}"
register: "dpdk_downloaded"
tags:
- - install-dpdk
+ - dpdk-inst
-- name: DPDK - Extract Release Archive
+- name: Extract Release Archive
unarchive:
remote_src: true
src: "{{ dpdk_target_dir }}/dpdk-{{ item }}.tar.xz"
@@ -29,9 +37,9 @@
when: "dpdk_downloaded"
register: "dpdk_extracted"
tags:
- - install-dpdk
+ - dpdk-inst
-- name: DPDK - Build igb_uio by default
+- name: Build igb_uio by default
lineinfile:
dest: "{{ dpdk_target_dir }}/dpdk-{{ item }}/config/common_base"
regexp: "^CONFIG_RTE_EAL_IGB_UIO"
@@ -40,21 +48,21 @@
when: "dpdk_extracted"
register: "dpdk_configured"
tags:
- - install-dpdk
+ - dpdk-inst
-- name: DPDK - Compile Release I
+- name: Compile Release I
become: yes
command: "make install T={{ dpdk_build_targets[item][ansible_machine] }} DESTDIR={{ dpdk_target_dir }}/dpdk-{{ item }} chdir={{ dpdk_target_dir }}/dpdk-{{ item }}"
loop: "{{ dpdk_version }}"
when: "dpdk_configured"
register: "dpdk_compiled"
tags:
- - install-dpdk
+ - dpdk-inst
-- name: DPDK - Link igb_uio Module
+- name: Link igb_uio Module
shell: "ln -fs {{ dpdk_target_dir }}/dpdk-{{ item }}/{{ dpdk_build_targets[item][ansible_machine] }}/kmod/igb_uio.ko /lib/modules/`uname -r`/igb_uio.ko && depmod -a"
ignore_errors: "yes"
loop: "{{ dpdk_version }}"
when: "dpdk_compiled"
tags:
- - install-dpdk
+ - dpdk-inst
\ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/iperf/defaults/main.yaml b/resources/tools/testbed-setup/ansible/roles/iperf/defaults/main.yaml
index 50210660f4..07af60b63a 100644
--- a/resources/tools/testbed-setup/ansible/roles/iperf/defaults/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/iperf/defaults/main.yaml
@@ -1,15 +1,19 @@
---
# file: roles/iperf/defaults/main.yaml
-packages: "{{ packages_base + packages_by_distro[ansible_distribution | lower] + packages_by_arch[ansible_machine] }}"
+packages: "{{ packages_base + packages_by_distro[ansible_distribution|lower][ansible_distribution_release] + packages_by_arch[ansible_machine] }}"
packages_base:
- []
packages_by_distro:
ubuntu:
- - "build-essential"
- - "lib32z1"
+ bionic:
+ - "build-essential"
+ - "lib32z1"
+ focal:
+ - "build-essential"
+ - "lib32z1"
packages_by_arch:
aarch64:
diff --git a/resources/tools/testbed-setup/ansible/roles/iperf/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/iperf/tasks/main.yaml
index 8233ba7113..f8948cae57 100644
--- a/resources/tools/testbed-setup/ansible/roles/iperf/tasks/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/iperf/tasks/main.yaml
@@ -1,15 +1,23 @@
---
# file: roles/iperf/tasks/main.yaml
-- name: iPerf - Install Distribution - Release - Machine Prerequisites
+- name: Inst - Update Package Cache (APT)
+ apt:
+ update_cache: yes
+ cache_valid_time: 3600
+ when:
+ - ansible_distribution|lower == 'ubuntu'
+ tags:
+ - iperf-inst-prerequisites
+
+- name: Inst - Prerequisites
package:
name: "{{ packages | flatten(levels=1) }}"
state: latest
- update_cache: true
tags:
- - install-dependencies
+ - iperf-inst-prerequisites
-- name: iPerf - Get Release Archive
+- name: Get Release Archive
get_url:
url: "https://downloads.es.net/pub/iperf/iperf-{{ item }}.tar.gz"
dest: "{{ iperf_target_dir }}/iperf-{{ item }}.tar.gz"
@@ -17,9 +25,9 @@
mode: 0644
loop: "{{ iperf_version }}"
tags:
- - install-iperf
+ - iperf-inst
-- name: iPerf - Extract Release Archive
+- name: Extract Release Archive
unarchive:
remote_src: true
src: "{{ iperf_target_dir }}/iperf-{{ item }}.tar.gz"
@@ -27,28 +35,28 @@
creates: "{{ iperf_target_dir }}/iperf-{{ item }}/src"
loop: "{{ iperf_version }}"
tags:
- - install-iperf
+ - iperf-inst
-- name: iPerf - Compile Release I
+- name: Compile Release I
command: "./configure"
args:
chdir: "{{ iperf_target_dir }}/iperf-{{ item }}/"
loop: "{{ iperf_version }}"
tags:
- - install-iperf
+ - iperf-inst
-- name: iPerf - Compile Release II
+- name: Compile Release II
command: "make"
args:
chdir: "{{ iperf_target_dir }}/iperf-{{ item }}/"
loop: "{{ iperf_version }}"
tags:
- - install-iperf
+ - iperf-inst
-- name: iPerf - Compile Release III
+- name: Compile Release III
command: "make install"
args:
chdir: "{{ iperf_target_dir }}/iperf-{{ item }}/"
loop: "{{ iperf_version }}"
tags:
- - install-iperf
+ - iperf-inst
\ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/defaults/main.yaml b/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/defaults/main.yaml
new file mode 100644
index 0000000000..9813d41afb
--- /dev/null
+++ b/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/defaults/main.yaml
@@ -0,0 +1,35 @@
+---
+# file: roles/jenkins_job_health_exporter/defaults/main.yaml
+
+# Conf - Jenkins Job Health Exporter.
+jenkins_host: "jenkins.fd.io"
+poll_interval_sec: 1800
+req_timeout_sec: 30
+bind_to: "0.0.0.0:9186"
+last_builds: 10
+jobs:
+ - "vpp-csit-verify-api-crc-master"
+ - "vpp-beta-verify-master-ubuntu2004-aarch64"
+ - "vpp-verify-master-centos8-aarch64"
+ - "vpp-verify-master-ubuntu1804-aarch64"
+ - "vpp-gcc-verify-master-ubuntu2004-x86_64"
+ - "vpp-verify-master-centos8-x86_64"
+ - "vpp-verify-master-debian10-x86_64"
+ - "vpp-verify-master-ubuntu2004-x86_64"
+ - "vpp-verify-master-ubuntu1804-x86_64"
+ - "vpp-debug-verify-master-ubuntu2004-x86_64"
+ - "vpp-checkstyle-verify-master-ubuntu2004-x86_64"
+ - "vpp-sphinx-docs-verify-master-ubuntu1804-x86_64"
+ - "vpp-docs-verify-master-ubuntu1804-x86_64"
+ - "vpp-make-test-docs-verify-master-ubuntu1804-x86_64"
+ - "vpp-csit-verify-device-master-1n-skx"
+ - "vpp-csit-verify-device-master-1n-tx2"
+
+# Conf - Service.
+jenkins_job_health_exporter_restart_handler_state: "restarted"
+
+# Inst - System paths.
+jenkins_job_health_exporter_target_dir: "/usr/bin"
+jenkins_job_health_exporter_conf_dir: "/etc"
+jenkins_job_health_exporter_url: "https://github.com/ayourtch/jenkins-job-health-exporter/releases/download"
+jenkins_job_health_exporter_version: "v0.0.3"
\ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/handlers/main.yaml b/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/handlers/main.yaml
new file mode 100644
index 0000000000..29fee98fed
--- /dev/null
+++ b/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/handlers/main.yaml
@@ -0,0 +1,9 @@
+---
+# file: roles/jenkins_job_health_exporter/handlers/main.yaml
+
+- name: Restart Jenkins Job Health Exporter
+ systemd:
+ daemon_reload: true
+ enabled: true
+ name: "jenkins-job-health-exporter"
+ state: "{{ jenkins_job_health_exporter_restart_handler_state }}"
diff --git a/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/tasks/main.yaml
new file mode 100644
index 0000000000..5dbe476019
--- /dev/null
+++ b/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/tasks/main.yaml
@@ -0,0 +1,38 @@
+---
+# file: roles/jenkins_job_health_exporter/tasks/main.yaml
+
+- name: Conf - Jenkins Job Health Exporter Config
+ template:
+ src: "templates/jenkins-job-health-exporter.j2"
+ dest: "/etc/jenkins-job-health-exporter.json"
+ owner: "root"
+ group: "root"
+ mode: "0644"
+ when:
+ - ansible_hostname == "s42-nomad"
+ tags:
+ - conf-jenkins-job-json
+
+- name: Inst - Jenkins Job Health Exporter Binary
+ get_url:
+ url: "{{ jenkins_job_health_exporter_url }}/{{ jenkins_job_health_exporter_version }}/jenkins-job-health-exporter"
+ dest: "{{ jenkins_job_health_exporter_target_dir }}/jenkins-job-health-exporter"
+ mode: "0755"
+ when:
+ - ansible_hostname == "s42-nomad"
+ tags:
+ - inst-jenkins-job-binary
+
+- name: Inst - Jenkins Job Health Exporter Service
+ template:
+ src: "templates/jenkins-job-health-exporter.service.j2"
+ dest: "/lib/systemd/system/jenkins-job-health-exporter.service"
+ owner: "root"
+ group: "root"
+ mode: "0644"
+ when:
+ - ansible_hostname == "s42-nomad"
+ notify:
+ - "Restart Jenkins Job Health Exporter"
+ tags:
+ - inst-jenkins-job-service
diff --git a/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/templates/jenkins-job-health-exporter.j2 b/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/templates/jenkins-job-health-exporter.j2
new file mode 100644
index 0000000000..5942b782e0
--- /dev/null
+++ b/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/templates/jenkins-job-health-exporter.j2
@@ -0,0 +1,16 @@
+{
+ "jenkins_host": "{{ jenkins_host }}",
+ "poll_interval_sec": {{ poll_interval_sec }},
+ "req_timeout_sec": {{ req_timeout_sec }},
+ "bind_to": "{{ bind_to }}",
+ "last_builds": {{ last_builds }},
+ "jobs": [
+{% for item in jobs %}
+ "{{ item }}"
+{%- if not loop.last %},
+{% endif %}
+{% endfor %}
+
+ ],
+ "verbose": 3
+}
\ No newline at end of file
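
Editor's note: with the defaults from roles/jenkins_job_health_exporter/defaults/main.yaml above, the template renders roughly the following /etc/jenkins-job-health-exporter.json (job list truncated here for brevity):

    {
        "jenkins_host": "jenkins.fd.io",
        "poll_interval_sec": 1800,
        "req_timeout_sec": 30,
        "bind_to": "0.0.0.0:9186",
        "last_builds": 10,
        "jobs": [
            "vpp-csit-verify-api-crc-master",
            "vpp-beta-verify-master-ubuntu2004-aarch64",
            ...
            "vpp-csit-verify-device-master-1n-tx2"
        ],
        "verbose": 3
    }
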
diff --git a/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/templates/jenkins-job-health-exporter.service.j2 b/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/templates/jenkins-job-health-exporter.service.j2
new file mode 100644
index 0000000000..38073d0a8c
--- /dev/null
+++ b/resources/tools/testbed-setup/ansible/roles/jenkins_job_health_exporter/templates/jenkins-job-health-exporter.service.j2
@@ -0,0 +1,13 @@
+[Unit]
+Description=Jenkins Job Health Exporter
+Documentation=https://github.com/ayourtch/jenkins-job-health-exporter
+
+[Service]
+Restart=always
+ExecStart={{ jenkins_job_health_exporter_target_dir }}/jenkins-job-health-exporter {{ jenkins_job_health_exporter_conf_dir }}/jenkins-job-health-exporter.json
+ExecReload=/bin/kill -HUP $MAINPID
+TimeoutStopSec=20s
+SendSIGKILL=no
+
+[Install]
+WantedBy=multi-user.target
\ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/kernel/defaults/main.yaml b/resources/tools/testbed-setup/ansible/roles/kernel/defaults/main.yaml
index d84a163487..b9b4253622 100644
--- a/resources/tools/testbed-setup/ansible/roles/kernel/defaults/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/kernel/defaults/main.yaml
@@ -2,27 +2,42 @@
# file: roles/kernel/defaults/main.yaml
# Kernel version to install (Default to any version).
-kernel_version: "{{ kernel_version_by_distro[ansible_distribution | lower] | join(' ') }}"
+kernel_version: "{{ kernel_version_by_distro[ansible_distribution|lower][ansible_distribution_release] | join(' ') }}"
kernel_version_by_distro:
ubuntu:
- - "4.15.0-72"
+ bionic:
+ - "4.15.0-72"
+ focal:
+ - "5.4.0-65"
-kernel_packages: "{{ kernel_packages_by_distro[ansible_distribution | lower] | flatten(levels=1) }}"
+kernel_packages: "{{ kernel_packages_by_distro[ansible_distribution|lower][ansible_distribution_release] | flatten(levels=1) }}"
kernel_packages_by_distro:
ubuntu:
- - "linux-image"
- - "linux-headers"
- - "linux-modules"
- - "linux-modules-extra"
- - "linux-tools"
+ bionic:
+ - "linux-image"
+ - "linux-headers"
+ - "linux-modules"
+ - "linux-modules-extra"
+ - "linux-tools"
+ focal:
+ - "linux-image"
+ - "linux-headers"
+ - "linux-modules"
+ - "linux-modules-extra"
+ - "linux-tools"
# Packages to remove in relation to kernel upgrade.
-absent_packages: "{{ absent_packages_by_distro[ansible_distribution | lower] | flatten(levels=1) }}"
+absent_packages: "{{ absent_packages_by_distro[ansible_distribution|lower][ansible_distribution_release] | flatten(levels=1) }}"
absent_packages_by_distro:
ubuntu:
- - "amd64-microcode"
- - "intel-microcode"
- - "iucode-tool"
+ bionic:
+ - "amd64-microcode"
+ - "intel-microcode"
+ - "iucode-tool"
+ focal:
+ - "amd64-microcode"
+ - "intel-microcode"
+ - "iucode-tool" \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/kernel/handlers/main.yaml b/resources/tools/testbed-setup/ansible/roles/kernel/handlers/main.yaml
index 3d30a0973c..963fd71756 100644
--- a/resources/tools/testbed-setup/ansible/roles/kernel/handlers/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/kernel/handlers/main.yaml
@@ -4,3 +4,5 @@
- name: Reboot Server
reboot:
reboot_timeout: 3600
+ tags:
+ - reboot-server
\ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/kernel/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/kernel/tasks/main.yaml
index 4c63c709a6..431e344fb8 100644
--- a/resources/tools/testbed-setup/ansible/roles/kernel/tasks/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/kernel/tasks/main.yaml
@@ -1,8 +1,9 @@
---
# file: roles/kernel/tasks/main.yaml
-- name: Kernel - Install distribution - release
- include_tasks: '{{ ansible_distribution|lower }}_{{ ansible_distribution_release }}.yaml'
- tags: install-kernel
+- name: Inst - Prerequisites
+ include_tasks: "{{ ansible_distribution|lower }}_{{ ansible_distribution_release }}.yaml"
+ tags:
+ - kernel-inst-prerequisites
- meta: flush_handlers
diff --git a/resources/tools/testbed-setup/ansible/roles/kernel/tasks/ubuntu_bionic.yaml b/resources/tools/testbed-setup/ansible/roles/kernel/tasks/ubuntu_bionic.yaml
index 3cb79352ee..349c853c11 100644
--- a/resources/tools/testbed-setup/ansible/roles/kernel/tasks/ubuntu_bionic.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/kernel/tasks/ubuntu_bionic.yaml
@@ -1,33 +1,37 @@
---
# file: roles/kernel/tasks/ubuntu_bionic.yaml
-- name: Kernel - Get Available Kernel Versions
+- name: Get Available Kernel Versions
command: "apt-cache showpkg linux-headers-*"
changed_when: false
register: apt_kernel_list
- tags: install-kernel
+ tags:
+ - kernel-inst
-- name: Kernel - Get installed packages with APT
+- name: Get installed packages with APT
command: "dpkg -l"
changed_when: false
register: apt_packages_list
- tags: install-kernel
+ tags:
+ - kernel-inst
-- name: Kernel - Set target APT kernel version
+- name: Set target APT kernel version
set_fact:
_kernel: "{{ apt_kernel_list | deb_kernel(
kernel_version, ansible_kernel) }}"
- tags: install-kernel
+ tags:
+ - kernel-inst
-- name: Kernel - Ensure Packages Versions
+- name: Ensure Packages Versions
apt:
name: "{{ apt_kernel_list | deb_kernel_pkg(
kernel_version, ansible_kernel, ansible_distribution,
ansible_architecture, item) }}"
loop: "{{ kernel_packages }}"
- tags: install-kernel
+ tags:
+ - kernel-inst
-- name: Kernel - Ensure Any Other Kernel Packages Are Removed
+- name: Ensure Any Other Kernel Packages Are Removed
apt:
name: "{{ apt_packages_list | deb_installed_kernel(
apt_kernel_list, kernel_version, ansible_kernel) }}"
@@ -35,12 +39,13 @@
purge: true
notify:
- "Reboot Server"
- tags: install-kernel
+ tags:
+ - kernel-inst
-- name: Kernel - Ensure Any Microcode Is Absent
+- name: Ensure Any Microcode Is Absent
apt:
name: "{{ absent_packages }}"
state: absent
purge: true
- tags: install-kernel
-
+ tags:
+ - kernel-inst
\ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/kernel/tasks/ubuntu_focal.yaml b/resources/tools/testbed-setup/ansible/roles/kernel/tasks/ubuntu_focal.yaml
new file mode 100644
index 0000000000..9cbc4d4787
--- /dev/null
+++ b/resources/tools/testbed-setup/ansible/roles/kernel/tasks/ubuntu_focal.yaml
@@ -0,0 +1,51 @@
+---
+# file: roles/kernel/tasks/ubuntu_focal.yaml
+
+- name: Get Available Kernel Versions
+ command: "apt-cache showpkg linux-headers-*"
+ changed_when: false
+ register: apt_kernel_list
+ tags:
+ - kernel-inst
+
+- name: Get installed packages with APT
+ command: "dpkg -l"
+ changed_when: false
+ register: apt_packages_list
+ tags:
+ - kernel-inst
+
+- name: Set target APT kernel version
+ set_fact:
+ _kernel: "{{ apt_kernel_list | deb_kernel(
+ kernel_version, ansible_kernel) }}"
+ tags:
+ - kernel-inst
+
+- name: Ensure Packages Versions
+ apt:
+ name: "{{ apt_kernel_list | deb_kernel_pkg(
+ kernel_version, ansible_kernel, ansible_distribution,
+ ansible_architecture, item) }}"
+ loop: "{{ kernel_packages }}"
+ tags:
+ - kernel-inst
+
+- name: Ensure Any Other Kernel Packages Are Removed
+ apt:
+ name: "{{ apt_packages_list | deb_installed_kernel(
+ apt_kernel_list, kernel_version, ansible_kernel) }}"
+ state: absent
+ purge: true
+ notify:
+ - "Reboot Server"
+ tags:
+ - kernel-inst
+
+- name: Ensure Any Microcode Is Absent
+ apt:
+ name: "{{ absent_packages }}"
+ state: absent
+ purge: true
+ tags:
+ - kernel-inst
\ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/kernel_vm/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/kernel_vm/tasks/main.yaml
index ba9426a55f..4d1b306e64 100644
--- a/resources/tools/testbed-setup/ansible/roles/kernel_vm/tasks/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/kernel_vm/tasks/main.yaml
@@ -1,7 +1,7 @@
---
# file: roles/kernel_vm/tasks/main.yaml
-- name: Kernel VM - Backup remote initramfs modules
+- name: Inst - Backup remote initramfs modules
copy:
src: "/etc/initramfs-tools/modules"
dest: "/tmp/initramfs_modules.bkp"
@@ -9,9 +9,9 @@
ignore_errors: yes
register: __initramfs_modules_backuped
tags:
- - install-kernel-image
+ - kernel-inst-image
-- name: Kernel VM - Backup remote initramfs resume config
+- name: Inst - Backup remote initramfs resume config
copy:
src: "/etc/initramfs-tools/conf.d/resume"
dest: "/tmp/initramfs-resume.bkp"
@@ -19,43 +19,43 @@
ignore_errors: yes
register: __initramfs_resume_backuped
tags:
- - install-kernel-image
+ - kernel-inst-image
-- name: Kernel VM - Update remote initramfs modules
+- name: Inst - Update remote initramfs modules
copy:
src: "../files/initramfs_modules"
dest: "/etc/initramfs-tools/modules"
tags:
- - install-kernel-image
+ - kernel-inst-image
-- name: Kernel VM - Update remote initramfs resume config
+- name: Inst - Update remote initramfs resume config
copy:
src: "../files/initramfs_resume"
dest: "/etc/initramfs-tools/conf.d/resume"
tags:
- - install-kernel-image
+ - kernel-inst-image
-- name: Kernel VM - Create target kernel dir
+- name: Inst - Create target kernel dir
file:
path: "/opt/boot"
state: "directory"
tags:
- - install-kernel-image
+ - kernel-inst-image
-- name: Kernel VM - Build initrd image
+- name: Inst - Build initrd image
shell: "update-initramfs -k {{ ansible_kernel }} -c -b /opt/boot"
tags:
- - install-kernel-image
+ - kernel-inst-image
-- name: Kernel VM - Copy corresponding kernel img
+- name: Inst - Copy corresponding kernel img
copy:
src: "/boot/vmlinuz-{{ ansible_kernel }}"
dest: "/opt/boot/vmlinuz-{{ ansible_kernel }}"
remote_src: yes
tags:
- - install-kernel-image
+ - kernel-inst-image
-- name: Kernel VM - Restore remote initramfs modules
+- name: Inst - Restore remote initramfs modules
copy:
src: "/tmp/initramfs_modules.bkp"
dest: "/etc/initramfs-tools/modules"
@@ -63,17 +63,17 @@
ignore_errors: yes
when: __initramfs_modules_backuped
tags:
- - install-kernel-image
+ - kernel-inst-image
-- name: Kernel VM - Remove remote backup initramfs modules
+- name: Inst - Remove remote backup initramfs modules
file:
path: "/tmp/initramfs_modules.bkp"
state: "absent"
when: __initramfs_modules_backuped
tags:
- - install-kernel-image
+ - kernel-inst-image
-- name: Kernel VM - Restore remote initramfs resume config
+- name: Inst - Restore remote initramfs resume config
copy:
src: "/tmp/initramfs-resume.bkp"
dest: "/etc/initramfs-tools/conf.d/resume"
@@ -81,12 +81,12 @@
ignore_errors: yes
when: __initramfs_resume_backuped
tags:
- - install-kernel-image
+ - kernel-inst-image
-- name: Kernel VM - Remove remote backup initramfs resume config
+- name: Inst - Remove remote backup initramfs resume config
file:
path: "/tmp/initramfs-resume.bkp"
state: "absent"
when: __initramfs_resume_backuped
tags:
- - install-kernel-image
+ - kernel-inst-image
diff --git a/resources/tools/testbed-setup/ansible/roles/kubernetes/tasks/ubuntu_bionic.yaml b/resources/tools/testbed-setup/ansible/roles/kubernetes/tasks/ubuntu_bionic.yaml
index ddb885f6ee..454e80e002 100644
--- a/resources/tools/testbed-setup/ansible/roles/kubernetes/tasks/ubuntu_bionic.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/kubernetes/tasks/ubuntu_bionic.yaml
@@ -22,7 +22,7 @@
apt_repository:
repo: '{{ kubernetes_apt_repository }}'
state: '{{ kubernetes_apt_repository_state }}'
- update_cache: True
+ update_cache: yes
tags: install-kubernetes
- name: Kubernetes - Install
diff --git a/resources/tools/testbed-setup/ansible/roles/mellanox/defaults/main.yaml b/resources/tools/testbed-setup/ansible/roles/mellanox/defaults/main.yaml
index a77bc67ec7..0961ec7df6 100644
--- a/resources/tools/testbed-setup/ansible/roles/mellanox/defaults/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/mellanox/defaults/main.yaml
@@ -18,4 +18,4 @@ packages_by_arch:
x86_64:
- []
-mellanox_version: "4.6-1.0.1.1"
+mellanox_version: "5.2-1.0.4.0"
\ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/mellanox/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/mellanox/tasks/main.yaml
index 2fb6e2e213..670282923a 100644
--- a/resources/tools/testbed-setup/ansible/roles/mellanox/tasks/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/mellanox/tasks/main.yaml
@@ -1,32 +1,40 @@
---
# file: roles/mellanox/tasks/main.yaml
-- name: Mellanox Install - Install Distribution - Release - Machine Prerequisites
+- name: Inst - Update Package Cache (APT)
+ apt:
+ update_cache: yes
+ cache_valid_time: 3600
+ when:
+ - ansible_distribution|lower == 'ubuntu'
+ tags:
+ - mellanox-inst-prerequisites
+
+- name: Inst - Prerequisites
package:
name: "{{ packages | flatten(levels=1) }}"
state: latest
- update_cache: true
tags:
- - install-dependencies
+ - mellanox-inst-prerequisites
-- name: Mellanox Install - Check Presence of Mellanox Hardware
+- name: Inst - Check Presence of Mellanox Hardware
shell: "lspci | grep Mellanox | awk '{print $1}'"
register: mellanox_pcis
failed_when: no
changed_when: no
tags:
- - install-mellanox
+ - mellanox-inst
-- name: Mellanox Install - Get OFED
+- name: Inst - Get OFED
get_url:
url: "http://content.mellanox.com/ofed/MLNX_OFED-{{ mellanox_version }}/MLNX_OFED_LINUX-{{ mellanox_version }}-{{ ansible_distribution|lower }}{{ ansible_distribution_version }}-{{ ansible_machine }}.tgz"
dest: "/opt/MLNX_OFED_LINUX-{{ mellanox_version }}-{{ ansible_distribution|lower }}{{ ansible_distribution_version }}-{{ ansible_machine }}.tgz"
mode: 0644
when: mellanox_pcis.stdout_lines | length > 0
tags:
- - install-mellanox
+ - mellanox-inst
-- name: Mellanox Install - Extract OFED
+- name: Inst - Extract OFED
unarchive:
remote_src: true
src: "/opt/MLNX_OFED_LINUX-{{ mellanox_version }}-{{ ansible_distribution|lower }}{{ ansible_distribution_version }}-{{ ansible_machine }}.tgz"
@@ -35,26 +43,25 @@
register: mellanox_firmware_extracted
when: mellanox_pcis.stdout_lines | length > 0
tags:
- - install-mellanox
+ - mellanox-inst
-- name: Mellanox Install - Install OFED
+- name: Inst - OFED
command: "./mlnxofedinstall --with-mft --dpdk --force --upstream-libs"
args:
chdir: "/opt/MLNX_OFED_LINUX-{{ mellanox_version }}-{{ ansible_distribution|lower }}{{ ansible_distribution_version }}-{{ ansible_machine }}"
when: mellanox_pcis.stdout_lines | length > 0 and mellanox_firmware_extracted
tags:
- - install-mellanox
+ - mellanox-inst
-- name: Mellanox Install - Switch Infiniband to Ethernet
+- name: Switch Infiniband to Ethernet
command: "mlxconfig --yes --dev {{ item }} set LINK_TYPE_P1=2 LINK_TYPE_P2=2"
with_items: "{{ mellanox_pcis.stdout_lines }}"
tags:
- - install-mellanox
+ - mellanox-conf
-- name: Mellanox Install - FIX qemu-system removal
+- name: FIX qemu-system removal
package:
name: "qemu-system"
state: latest
- update_cache: true
tags:
- - install-mellanox
+ - mellanox-inst
diff --git a/resources/tools/testbed-setup/ansible/roles/nomad/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/nomad/tasks/main.yaml
index e1341b3361..54e80513b8 100644
--- a/resources/tools/testbed-setup/ansible/roles/nomad/tasks/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/nomad/tasks/main.yaml
@@ -1,11 +1,19 @@
---
# file: roles/nomad/tasks/main.yaml
+- name: Inst - Update Package Cache (APT)
+ apt:
+ update_cache: yes
+ cache_valid_time: 3600
+ when:
+ - ansible_distribution|lower == 'ubuntu'
+ tags:
+ - nomad-inst-prerequisites
+
- name: Inst - Prerequisites
package:
name: "{{ packages | flatten(levels=1) }}"
state: latest
- update_cache: true
tags:
- nomad-inst-prerequisites
diff --git a/resources/tools/testbed-setup/ansible/roles/performance_tuning/defaults/main.yaml b/resources/tools/testbed-setup/ansible/roles/performance_tuning/defaults/main.yaml
index 612bc5a946..2dad931e92 100644
--- a/resources/tools/testbed-setup/ansible/roles/performance_tuning/defaults/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/performance_tuning/defaults/main.yaml
@@ -1,14 +1,17 @@
---
# file: roles/performance_tuning/defaults/main.yaml
-packages: "{{ packages_base + packages_by_distro[ansible_distribution | lower] + packages_by_arch[ansible_machine] }}"
+packages: "{{ packages_base + packages_by_distro[ansible_distribution|lower][ansible_distribution_release] + packages_by_arch[ansible_machine] }}"
packages_base:
- "cpufrequtils"
packages_by_distro:
ubuntu:
- - []
+ bionic:
+ - []
+ focal:
+ - []
packages_by_arch:
aarch64:
diff --git a/resources/tools/testbed-setup/ansible/roles/performance_tuning/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/performance_tuning/tasks/main.yaml
index c437b40f86..e9cdd0d819 100644
--- a/resources/tools/testbed-setup/ansible/roles/performance_tuning/tasks/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/performance_tuning/tasks/main.yaml
@@ -1,41 +1,31 @@
---
# file: roles/performance_tuning/tasks/main.yaml
-- name: Performance Tuning - Install Distribution - Release - Machine Prerequisites
+- name: Inst - Update Package Cache (APT)
+ apt:
+ update_cache: yes
+ cache_valid_time: 3600
+ when:
+ - ansible_distribution|lower == 'ubuntu'
+ tags:
+ - perf-inst-prerequisites
+
+- name: Inst - Machine Prerequisites
package:
name: "{{ packages | flatten(levels=1) }}"
state: latest
- update_cache: true
- tags:
- - install-dependencies
-
-- name: Performance Tuning - Distribution - release - machine optimizations
- include_tasks: '{{ ansible_distribution|lower }}_{{ ansible_distribution_release }}.yaml'
- tags:
- - machine-optimizations
-
-- name: Performance Tuning - Configure {{ ansible_machine }} Kernel Parameters
- lineinfile:
- path: "/etc/default/grub"
- state: "present"
- regexp: "^GRUB_CMDLINE_LINUX="
- line: "GRUB_CMDLINE_LINUX=\"{% for key, value in grub.items() %}{% if value %}{{key}}={{value}} {% else %}{{key}} {% endif %}{% endfor %}\""
- notify:
- - "Update GRUB"
tags:
- - set-grub
-
-- meta: flush_handlers
+ - perf-inst-prerequisites
-- name: Performance Tuning - Turbo Boost
+- name: Conf - Turbo Boost
import_tasks: turbo_boost.yaml
when: >
cpu_microarchitecture == "skylake" or
cpu_microarchitecture == "cascadelake"
tags:
- - turbo-boost
+ - perf-conf-turbo-boost
-- name: Performance Tuning - Adjust nr_hugepages
+- name: Conf - Adjust nr_hugepages
# change the minimum size of the hugepage pool.
# 2G VPP, 4GB per VNF/CNF, 2G reserve
sysctl:
@@ -45,9 +35,9 @@
sysctl_file: "/etc/sysctl.d/90-csit.conf"
reload: "yes"
tags:
- - set-sysctl
+ - perf-conf-sysctl
-- name: Performance Tuning - Adjust max_map_count
+- name: Conf - Adjust max_map_count
# this file contains the maximum number of memory map areas a process
# may have. memory map areas are used as a side-effect of calling
# malloc, directly by mmap and mprotect, and also when loading shared
@@ -64,9 +54,9 @@
sysctl_file: "/etc/sysctl.d/90-csit.conf"
reload: "yes"
tags:
- - set-sysctl
+ - perf-conf-sysctl
-- name: Performance Tuning - Adjust hugetlb_shm_group
+- name: Conf - Adjust hugetlb_shm_group
# hugetlb_shm_group contains group id that is allowed to create sysv
# shared memory segment using hugetlb page.
sysctl:
@@ -76,9 +66,9 @@
sysctl_file: "/etc/sysctl.d/90-csit.conf"
reload: "yes"
tags:
- - set-sysctl
+ - perf-conf-sysctl
-- name: Performance Tuning - Adjust swappiness
+- name: Conf - Adjust swappiness
# this control is used to define how aggressively the kernel will swap
# memory pages. higher values will increase aggressiveness, lower values
# decrease the amount of swap. a value of 0 instructs the kernel not to
@@ -91,9 +81,9 @@
sysctl_file: "/etc/sysctl.d/90-csit.conf"
reload: "yes"
tags:
- - set-sysctl
+ - perf-conf-sysctl
-- name: Performance Tuning - Adjust shmmax
+- name: Conf - Adjust shmmax
# shared memory max must be greater than or equal to the total size of hugepages.
# for 2mb pages, totalhugepagesize = vm.nr_hugepages * 2 * 1024 * 1024
# if the existing kernel.shmmax setting (cat /proc/sys/kernel/shmmax)
@@ -106,9 +96,9 @@
sysctl_file: "/etc/sysctl.d/90-csit.conf"
reload: "yes"
tags:
- - set-sysctl
+ - perf-conf-sysctl
-- name: Performance Tuning - Adjust watchdog_cpumask
+- name: Conf - Adjust watchdog_cpumask
# this value can be used to control on which cpus the watchdog may run.
# the default cpumask is all possible cores, but if no_hz_full is
# enabled in the kernel config, and cores are specified with the
@@ -126,9 +116,9 @@
sysctl_file: "/etc/sysctl.d/90-csit.conf"
reload: "yes"
tags:
- - set-sysctl
+ - perf-conf-sysctl
-- name: Performance Tuning - Adjust randomize_va_space
+- name: Conf - Adjust randomize_va_space
# this option can be used to select the type of process address
# space randomization that is used in the system, for architectures
# that support this feature.
@@ -142,45 +132,58 @@
sysctl_file: "/etc/sysctl.d/90-csit.conf"
reload: "yes"
tags:
- - set-sysctl
+ - perf-conf-sysctl
-- name: Performance Tuning - Copy Cpufrequtils File
+- name: Conf - Cpufrequtils
copy:
src: "files/cpufrequtils"
dest: "/etc/default/cpufrequtils"
owner: "root"
group: "root"
- mode: "0644"
+ mode: 0644
tags:
- - copy-cpufrequtils
+ - perf-conf-cpufrequtils
-- name: Performance Tuning - Copy Irqbalance File
+- name: Conf - Irqbalance
template:
src: "files/irqbalance"
dest: "/etc/default/irqbalance"
owner: "root"
group: "root"
- mode: "0644"
- notify:
- - "Update GRUB"
+ mode: 0644
tags:
- - copy-irqbalance
+ - perf-conf-irqbalance
-- name: Performance Tuning - Set Ondemand Service To Disable
+- name: Conf - Set Ondemand Service To Disable
service:
name: "ondemand"
enabled: "no"
tags:
- - set-ondemand
+ - perf-conf-ondemand
+
+- name: Conf - Kernel Parameters
+ lineinfile:
+ path: "/etc/default/grub"
+ state: "present"
+ regexp: "^GRUB_CMDLINE_LINUX="
+ line: "GRUB_CMDLINE_LINUX=\"{% for key, value in grub.items() %}{% if value is sameas true %}{{key}} {% else %}{{key}}={{value}} {% endif %}{% endfor %}\""
+ notify:
+ - "Update GRUB"
+ tags:
+ - perf-conf-grub
-- name: Performance Tuning - Load Kernel Modules By Default
+- meta: flush_handlers
+
+- name: Conf - Load Kernel Modules By Default
lineinfile:
path: "/etc/modules"
state: "present"
line: "{{ item }}"
with_items:
- "vfio-pci"
+ notify:
+ - "Reboot Server"
tags:
- - load-kernel-modules
+ - perf-conf-load-kernel-modules
-- meta: flush_handlers
+- meta: flush_handlers
\ No newline at end of file
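
Editor's note: the relocated "Conf - Kernel Parameters" task also inverts the boolean handling in the Jinja line: a value that `is sameas true` now renders as a bare flag, anything else as key=value (previously a true value rendered as "key=True"). A sketch, assuming a hypothetical `grub` dict:

    # host_vars (hypothetical):
    grub:
      nosoftlockup: true
      isolcpus: "1-27"
      nohz_full: "1-27"

    # rendered into /etc/default/grub:
    GRUB_CMDLINE_LINUX="nosoftlockup isolcpus=1-27 nohz_full=1-27 "

(the trailing space inside the quotes is emitted by the loop; it is harmless on the kernel command line).
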
diff --git a/resources/tools/testbed-setup/ansible/roles/performance_tuning/tasks/turbo_boost.yaml b/resources/tools/testbed-setup/ansible/roles/performance_tuning/tasks/turbo_boost.yaml
index 310803ca5b..7f69365a2d 100644
--- a/resources/tools/testbed-setup/ansible/roles/performance_tuning/tasks/turbo_boost.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/performance_tuning/tasks/turbo_boost.yaml
@@ -1,36 +1,44 @@
---
# file: roles/performance_tuning/tasks/turbo_boost.yaml
-- name: Turbo Boost - Install msr-tools
+- name: Inst - Update Package Cache (APT)
+ apt:
+ update_cache: yes
+ cache_valid_time: 3600
+ when:
+ - ansible_distribution|lower == 'ubuntu'
+ tags:
+ - turbo-inst-prerequisites
+
+- name: Inst - msr-tools
package:
name:
- "msr-tools"
state: latest
- update_cache: true
tags:
- - turbo-boost
+ - turbo-inst-prerequisites
-- name: Turbo Boost - Load msr By Default
+- name: Conf - Load msr By Default
lineinfile:
path: "/etc/modules"
state: "present"
line: "msr"
tags:
- - turbo-boost
+ - turbo-conf-msr
-- name: Turbo Boost - Custom Startup Service Hook
+- name: Conf - Custom Startup Service Hook
copy:
src: "files/disable-turbo-boost.service"
dest: "/etc/systemd/system/disable-turbo-boost.service"
owner: "root"
group: "root"
- mode: "0644"
+ mode: 0644
tags:
- - turbo-boost
+ - turbo-conf-msr
-- name: Turbo Boost - Custom Startup Service Hook Enable
+- name: Conf - Custom Startup Service Hook Enable
service:
name: "disable-turbo-boost"
enabled: yes
tags:
- - turbo-boost
+ - turbo-conf-msr
diff --git a/resources/tools/testbed-setup/ansible/roles/performance_tuning/tasks/ubuntu_bionic.yaml b/resources/tools/testbed-setup/ansible/roles/performance_tuning/tasks/ubuntu_bionic.yaml
deleted file mode 100644
index 273ad53f25..0000000000
--- a/resources/tools/testbed-setup/ansible/roles/performance_tuning/tasks/ubuntu_bionic.yaml
+++ /dev/null
@@ -1,18 +0,0 @@
----
-# file: roles/performance_tuning/tasks/ubuntu_bionic.yaml
-
-- name: Ubuntu Bionic - Mount /tmp as tmpfs I
- copy:
- src: "/usr/share/systemd/tmp.mount"
- dest: "/etc/systemd/system/tmp.mount"
- remote_src: yes
- tags:
- - machine-optimizations
-
-- name: Ubuntu Bionic - Mount /tmp as tmpfs II
- systemd:
- name: "tmp.mount"
- daemon_reload: yes
- enabled: yes
- tags:
- - machine-optimizations
diff --git a/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/defaults/main.yaml b/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/defaults/main.yaml
new file mode 100644
index 0000000000..eb2b94cb26
--- /dev/null
+++ b/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/defaults/main.yaml
@@ -0,0 +1,17 @@
+---
+# file: roles/prometheus_exporter/defaults/main.yaml
+
+# Inst - Exporters.
+ne_packages: "{{ ne_packages_by_distro[ansible_distribution | lower][ansible_machine] }}"
+
+ne_packages_by_distro:
+ ubuntu:
+ aarch64: "http://ports.ubuntu.com/pool/universe/p/prometheus-node-exporter/prometheus-node-exporter_1.0.1+ds-1_arm64.deb"
+ x86_64: "http://archive.ubuntu.com/ubuntu/pool/universe/p/prometheus-node-exporter/prometheus-node-exporter_1.0.1+ds-1_amd64.deb"
+
+be_packages: "{{ be_packages_by_distro[ansible_distribution | lower][ansible_machine] }}"
+
+be_packages_by_distro:
+ ubuntu:
+ aarch64: "http://ports.ubuntu.com/pool/universe/p/prometheus-blackbox-exporter/prometheus-blackbox-exporter_0.17.0+ds-1_arm64.deb"
+ x86_64: "http://archive.ubuntu.com/ubuntu/pool/universe/p/prometheus-blackbox-exporter/prometheus-blackbox-exporter_0.17.0+ds-1_amd64.deb"
diff --git a/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/files/blackbox.yml b/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/files/blackbox.yml
new file mode 100644
index 0000000000..f61c26e1a8
--- /dev/null
+++ b/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/files/blackbox.yml
@@ -0,0 +1,25 @@
+modules:
+ http_2xx:
+ prober: http
+ timeout: 5s
+ http:
+ valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
+ no_follow_redirects: false
+ fail_if_ssl: false
+ fail_if_not_ssl: true
+ tls_config:
+ insecure_skip_verify: false
+ preferred_ip_protocol: "ip4"
+ icmp_v4:
+ prober: icmp
+ timeout: 5s
+ icmp:
+ preferred_ip_protocol: "ip4"
+ dns_udp:
+ prober: dns
+ timeout: 5s
+ dns:
+ query_name: "jenkins.fd.io"
+ query_type: "A"
+ valid_rcodes:
+ - NOERROR
\ No newline at end of file
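
Editor's note: once the Blackbox Exporter is installed (see ubuntu_bionic.yaml below), the modules defined above are exercised through its /probe endpoint; assuming the exporter's stock listen address on port 9115, a module can be spot-checked with:

    curl 'http://localhost:9115/probe?module=http_2xx&target=https://jenkins.fd.io'
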
diff --git a/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/handlers/main.yaml b/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/handlers/main.yaml
new file mode 100644
index 0000000000..9c374eaa61
--- /dev/null
+++ b/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/handlers/main.yaml
@@ -0,0 +1,16 @@
+---
+# file: roles/prometheus_exporter/handlers/main.yaml
+
+- name: Restart Prometheus Node Exporter
+ systemd:
+ daemon_reload: true
+ enabled: true
+ name: "prometheus-node-exporter"
+ state: "restarted"
+
+- name: Restart Prometheus Blackbox Exporter
+ systemd:
+ daemon_reload: true
+ enabled: true
+ name: "prometheus-blackbox-exporter"
+ state: "restarted" \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/tasks/main.yaml
new file mode 100644
index 0000000000..b38215c4a2
--- /dev/null
+++ b/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/tasks/main.yaml
@@ -0,0 +1,15 @@
+---
+# file: roles/prometheus_exporter/tasks/main.yaml
+
+- include_tasks: "{{ ansible_distribution|lower }}_{{ ansible_distribution_release }}.yaml"
+ tags:
+ - prometheus-inst
+
+- name: Conf - Prometheus Blackbox Exporter
+ copy:
+ src: 'files/blackbox.yml'
+ dest: '/etc/prometheus/blackbox.yml'
+ notify:
+ - "Restart Prometheus Blackbox Exporter"
+ tags:
+ - prometheus-conf-blackbox-exporter
\ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/tasks/ubuntu_bionic.yaml b/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/tasks/ubuntu_bionic.yaml
new file mode 100644
index 0000000000..566753e272
--- /dev/null
+++ b/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/tasks/ubuntu_bionic.yaml
@@ -0,0 +1,33 @@
+---
+# file: roles/prometheus_exporter/tasks/ubuntu_bionic.yaml
+
+- name: Inst - Update Package Cache (APT)
+ apt:
+ update_cache: yes
+ cache_valid_time: 3600
+ tags:
+ - prometheus-inst-prerequisites
+
+- name: Inst - Prerequisites
+ package:
+ name: "init-system-helpers"
+ default_release: "bionic-backports"
+ state: latest
+ tags:
+ - prometheus-inst-prerequisites
+
+- name: Inst - Prometheus Node Exporter
+ apt:
+ deb: "{{ ne_packages }}"
+ notify:
+ - "Restart Prometheus Node Exporter"
+ tags:
+ - prometheus-inst-node-exporter
+
+- name: Inst - Prometheus Blackbox Exporter
+ apt:
+ deb: "{{ be_packages }}"
+ notify:
+ - "Restart Prometheus Blackbox Exporter"
+ tags:
+ - prometheus-inst-blackbox-exporter
\ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/tg/files/csit-initialize-docker-tg.sh b/resources/tools/testbed-setup/ansible/roles/tg/files/csit-initialize-docker-tg.sh
index 7b90d20bda..1192feeee9 100755
--- a/resources/tools/testbed-setup/ansible/roles/tg/files/csit-initialize-docker-tg.sh
+++ b/resources/tools/testbed-setup/ansible/roles/tg/files/csit-initialize-docker-tg.sh
@@ -24,7 +24,7 @@ case "${1:-start}" in
docker network create --driver bridge csit-nw-tg${cnt} || true
# If the IMAGE is not already loaded then docker run will pull the
# IMAGE, and all image dependencies, before it starts the container.
- dcr_image="snergster/csit-sut:latest"
+ dcr_image="csit_sut-ubuntu2004:local"
# Run the container in the background and print the new container
# ID.
dcr_stc_params="--detach=true "
@@ -33,7 +33,7 @@ case "${1:-start}" in
# containers.
dcr_stc_params+="--privileged "
# Publish all exposed ports to random ports on the host interfaces.
- dcr_stc_params+="--publish 600${cnt}:22 "
+ dcr_stc_params+="--publish 600${cnt}:2222 "
# Automatically remove the container when it exits.
dcr_stc_params+="--rm "
# Size of /dev/shm.
diff --git a/resources/tools/testbed-setup/ansible/roles/tg/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/tg/tasks/main.yaml
index d7a9ed882d..4e79dabfab 100644
--- a/resources/tools/testbed-setup/ansible/roles/tg/tasks/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/tg/tasks/main.yaml
@@ -1,28 +1,30 @@
---
# file: roles/tg/tasks/main.yaml
-- name: TG - Copy csit-initialize-docker-tg.sh
+- name: Conf - csit-initialize-docker-tg.sh
copy:
src: "files/csit-initialize-docker-tg.sh"
dest: "/usr/local/bin/csit-initialize-docker-tg.sh"
owner: "root"
group: "root"
- mode: "744"
- when: docker_tg is defined
+ mode: 0744
+ when:
+ - docker_tg is defined
tags:
- - docker-tg
+ - tg-conf-docker
-- name: TG - Start csit-initialize-docker-tg.service
+- name: Conf - Start csit-initialize-docker-tg.service
copy:
src: "files/csit-initialize-docker-tg.service"
dest: "/etc/systemd/system/"
owner: "root"
group: "root"
- mode: "644"
+ mode: 0644
notify:
- "Start csit-initialize-docker-tg.service"
- when: docker_tg is defined
+ when:
+ - docker_tg is defined
tags:
- - docker-tg
+ - tg-conf-docker
- meta: flush_handlers
diff --git a/resources/tools/testbed-setup/ansible/roles/topology/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/topology/tasks/main.yaml
index 9efdc71759..cf3eb5367f 100644
--- a/resources/tools/testbed-setup/ansible/roles/topology/tasks/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/topology/tasks/main.yaml
@@ -3,7 +3,7 @@
- name: Create topology file
template:
- src: 'templates/topology_{{ cloud_topology }}.j2'
- dest: '../../../../topologies/available/{{ cloud_topology }}_testbed.yaml'
+ src: "templates/topology_{{ cloud_topology }}.j2"
+ dest: "../../../../topologies/available/{{ cloud_topology }}_{{ testbed_name }}.yaml"
tags:
- create-topology-file
diff --git a/resources/tools/testbed-setup/ansible/roles/trex/defaults/main.yaml b/resources/tools/testbed-setup/ansible/roles/trex/defaults/main.yaml
index c3caf52d76..19bb15e9d3 100644
--- a/resources/tools/testbed-setup/ansible/roles/trex/defaults/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/trex/defaults/main.yaml
@@ -1,23 +1,35 @@
---
# file: roles/trex/defaults/main.yaml
-packages: "{{ packages_base + packages_by_distro[ansible_distribution | lower] + packages_by_arch[ansible_machine] }}"
+packages: "{{ packages_base + packages_by_distro[ansible_distribution|lower][ansible_distribution_release] + packages_by_arch[ansible_machine] }}"
packages_base:
- []
packages_by_distro:
ubuntu:
- - "build-essential"
- - "libmnl-dev"
- - "libnuma-dev"
- - "libpcap-dev"
- - "librdmacm-dev"
- - "librdmacm1"
- - "libssl-dev"
- - "pciutils"
- - "python3-pip"
- - "zlib1g-dev"
+ bionic:
+ - "build-essential"
+ - "libmnl-dev"
+ - "libnuma-dev"
+ - "libpcap-dev"
+ - "librdmacm-dev"
+ - "librdmacm1"
+ - "libssl-dev"
+ - "pciutils"
+ - "python3-pip"
+ - "zlib1g-dev"
+ focal:
+ - "build-essential"
+ - "libmnl-dev"
+ - "libnuma-dev"
+ - "libpcap-dev"
+ - "librdmacm-dev"
+ - "librdmacm1"
+ - "libssl-dev"
+ - "pciutils"
+ - "python3-pip"
+ - "zlib1g-dev"
packages_by_arch:
aarch64:
@@ -28,12 +40,5 @@ packages_by_arch:
trex_target_dir: "/opt"
trex_url: "https://github.com/cisco-system-traffic-generator/trex-core/archive/"
trex_version:
- # rls1908
- - "2.54"
- # rls2001
- # rls2005
- - "2.73"
- # rls2009
- - "2.82"
- # rls2101
- - "2.86"
+ # master // ubuntu 20.04
+ - "2.88" \ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/trex/tasks/deploy_block.yaml b/resources/tools/testbed-setup/ansible/roles/trex/tasks/deploy_block.yaml
index 1513a0a617..5a7890b071 100644
--- a/resources/tools/testbed-setup/ansible/roles/trex/tasks/deploy_block.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/trex/tasks/deploy_block.yaml
@@ -1,7 +1,7 @@
---
# file: roles/trex/tasks/deploy_block.yaml
-- name: "Get Release {{ item }}"
+- name: Get Release {{ item }}
get_url:
url: "{{ trex_url }}/v{{ item }}.tar.gz"
dest: "{{ trex_target_dir }}/trex-core-{{ item }}.tar.gz"
@@ -9,12 +9,12 @@
mode: 0644
register: trex_downloaded
-- name: "Create Directory {{ item }}"
+- name: Create Directory {{ item }}
file:
path: "{{ trex_target_dir }}/trex-core-{{ item }}"
state: "directory"
-- name: "Extract Release {{ item }}"
+- name: Extract Release {{ item }}
unarchive:
remote_src: true
src: "{{ trex_target_dir }}/trex-core-{{ item }}.tar.gz"
@@ -30,25 +30,25 @@
when:
- azure is defined and item == "2.73"
-- name: "Compile Release {{ item }} Part I"
+- name: Compile Release {{ item }} Part I
command: "./b configure"
args:
chdir: "{{ trex_target_dir }}/trex-core-{{ item }}/linux_dpdk/"
when: trex_extracted.changed
-- name: "Compile Release {{ item }} Part II"
+- name: Compile Release {{ item }} Part II
command: "./b build"
args:
chdir: "{{ trex_target_dir }}/trex-core-{{ item }}/linux_dpdk/"
when: trex_extracted.changed
-- name: "Compile Release {{ item }} Part III"
+- name: Compile Release {{ item }} Part III
command: "make -j 16"
args:
chdir: "{{ trex_target_dir }}/trex-core-{{ item }}/scripts/ko/src"
when: trex_extracted.changed
-- name: "Compile Release {{ item }} Part IV"
+- name: Compile Release {{ item }} Part IV
command: "make install"
args:
chdir: "{{ trex_target_dir }}/trex-core-{{ item }}/scripts/ko/src"
diff --git a/resources/tools/testbed-setup/ansible/roles/trex/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/trex/tasks/main.yaml
index 019a27f79d..d43baf909b 100644
--- a/resources/tools/testbed-setup/ansible/roles/trex/tasks/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/trex/tasks/main.yaml
@@ -1,16 +1,24 @@
---
# file: roles/trex/tasks/main.yaml
-- name: Install Distribution - Release - Machine Prerequisites
+- name: Inst - Update Package Cache (APT)
+ apt:
+ update_cache: yes
+ cache_valid_time: 3600
+ when:
+ - ansible_distribution|lower == 'ubuntu'
+ tags:
+ - trex-inst-prerequisites
+
+- name: Inst - Prerequisites
package:
name: "{{ packages | flatten(levels=1) }}"
state: latest
- update_cache: true
tags:
- - install-dependencies
+ - trex-inst-prerequisites
-- name: Deploy Multiple T-Rex Versions
+- name: Inst - Multiple T-Rex Versions
include_tasks: deploy_block.yaml
loop: "{{ trex_version }}"
tags:
- - install-trex
\ No newline at end of file
+ - trex-inst
\ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/roles/vpp/defaults/main.yaml b/resources/tools/testbed-setup/ansible/roles/vpp/defaults/main.yaml
index 323d00cf29..7fac499307 100644
--- a/resources/tools/testbed-setup/ansible/roles/vpp/defaults/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/vpp/defaults/main.yaml
@@ -1,7 +1,7 @@
---
# file: roles/vpp/defaults/main.yaml
-packages: "{{ packages_base + packages_by_distro[ansible_distribution | lower] + packages_by_arch[ansible_machine] }}"
+packages: "{{ packages_base + packages_by_distro[ansible_distribution|lower][ansible_distribution_release] + packages_by_arch[ansible_machine] }}"
packages_base:
- "gdb"
@@ -12,13 +12,22 @@ packages_base:
packages_by_distro:
ubuntu:
- - "build-essential"
- - "libglib2.0-dev"
- - "libmbedcrypto1"
- - "libmbedtls10"
- - "libmbedx509-0"
- - "libnuma-dev"
- - "libpixman-1-dev"
+ bionic:
+ - "build-essential"
+ - "libglib2.0-dev"
+ - "libmbedcrypto1"
+ - "libmbedtls10"
+ - "libmbedx509-0"
+ - "libnuma-dev"
+ - "libpixman-1-dev"
+ focal:
+ - "build-essential"
+ - "libglib2.0-dev"
+ - "libmbedcrypto3"
+ - "libmbedtls12"
+ - "libmbedx509-0"
+ - "libnuma-dev"
+ - "libpixman-1-dev"
packages_by_arch:
aarch64:
diff --git a/resources/tools/testbed-setup/ansible/roles/vpp/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/vpp/tasks/main.yaml
index 174373c110..ef03011b51 100644
--- a/resources/tools/testbed-setup/ansible/roles/vpp/tasks/main.yaml
+++ b/resources/tools/testbed-setup/ansible/roles/vpp/tasks/main.yaml
@@ -1,27 +1,27 @@
---
-# file: roles/sut/tasks/main.yaml
+# file: roles/vpp/tasks/main.yaml
-- name: SUT - Install Distribution - Release - Machine Prerequisites
+- name: Inst - Update Package Cache (APT)
+ apt:
+ update_cache: yes
+ cache_valid_time: 3600
+ when:
+ - ansible_distribution|lower == 'ubuntu'
+ tags:
+ - vpp-inst-prerequisites
+
+- name: Inst - Prerequisites
package:
name: "{{ packages | flatten(levels=1) }}"
state: latest
- update_cache: true
- tags:
- - install-dependencies
-
-- name: SUT - Install VPP 19.08 PIP requirements
- pip:
- name:
- - "aenum==2.1.2"
- executable: pip2
tags:
- - install-pip
+ - vpp-inst-prerequisites
-- name: SUT - Copy 80-vpp.conf
+- name: Conf - sysctl
file:
src: "/dev/null"
dest: "/etc/sysctl.d/80-vpp.conf"
state: "link"
become: yes
tags:
- - create-80-vpp
+ - vpp-conf-sysctl
diff --git a/resources/tools/testbed-setup/ansible/site.yaml b/resources/tools/testbed-setup/ansible/site.yaml
index 5ac76ca340..4436c21b18 100644
--- a/resources/tools/testbed-setup/ansible/site.yaml
+++ b/resources/tools/testbed-setup/ansible/site.yaml
@@ -2,16 +2,25 @@
# file: site.yaml
- import_playbook: tg.yaml
- tags: tg
+ tags:
+ - tg
+ - tg_aws
+ - tg_azure
- import_playbook: sut.yaml
- tags: sut
+ tags:
+ - sut
+ - sut_aws
+ - sut_azure
- import_playbook: vpp_device.yaml
- tags: vpp-device
+ tags:
+ - vpp-device
- import_playbook: nomad.yaml
- tags: nomad
+ tags:
+ - nomad
- import_playbook: dev.yaml
- tags: dev
+ tags:
+ - dev
diff --git a/resources/tools/testbed-setup/ansible/site_aws.yaml b/resources/tools/testbed-setup/ansible/site_aws.yaml
deleted file mode 100644
index 4e23a97f85..0000000000
--- a/resources/tools/testbed-setup/ansible/site_aws.yaml
+++ /dev/null
@@ -1,8 +0,0 @@
----
-# file: site_aws.yaml
-
-- import_playbook: tg_aws.yaml
- tags: tg
-
-- import_playbook: sut_aws.yaml
- tags: sut
diff --git a/resources/tools/testbed-setup/ansible/site_azure.yaml b/resources/tools/testbed-setup/ansible/site_azure.yaml
deleted file mode 100644
index c60e8b8f30..0000000000
--- a/resources/tools/testbed-setup/ansible/site_azure.yaml
+++ /dev/null
@@ -1,8 +0,0 @@
----
-# file: site_azure.yaml
-
-- import_playbook: tg_azure.yaml
- tags: tg
-
-- import_playbook: sut_azure.yaml
- tags: sut
diff --git a/resources/tools/testbed-setup/ansible/sut.yaml b/resources/tools/testbed-setup/ansible/sut.yaml
index 302fa1b78d..abc25a279c 100644
--- a/resources/tools/testbed-setup/ansible/sut.yaml
+++ b/resources/tools/testbed-setup/ansible/sut.yaml
@@ -5,6 +5,12 @@
remote_user: testuser
become: yes
become_user: root
+ gather_facts: false
+ pre_tasks:
+ - name: Gathering Facts
+ gather_facts:
+ tags:
+ - always
roles:
- role: baremetal
tags: baremetal
@@ -14,14 +20,12 @@
tags: kernel
- role: mellanox
tags: mellanox
+ - role: docker
+ tags: docker
- role: vpp
tags: vpp
- role: dpdk
tags: dpdk
- - role: docker
- tags: docker
- - role: kubernetes
- tags: kubernetes
- role: kernel_vm
tags: kernel_vm
- role: csit_sut_image
@@ -32,3 +36,66 @@
tags: cleanup
- role: calibration
tags: calibration
+
+- hosts: sut_aws
+ remote_user: testuser
+ become: yes
+ become_user: root
+ gather_facts: false
+ pre_tasks:
+ - name: Gathering Facts
+ gather_facts:
+ tags:
+ - always
+ roles:
+ - role: user_add
+ tags: user_add
+ - role: common
+ tags: common
+ - role: vpp
+ tags: vpp
+ - role: dpdk
+ tags: dpdk
+ - role: iperf
+ tags: iperf
+ - role: docker
+ tags: docker
+ - role: aws
+ tags: aws
+ - role: cleanup
+ tags: cleanup
+ # TODO: 'Check Kernel Parameters' failing in
+ # resources/tools/testbed-setup/ansible/roles/calibration/tasks/main.yaml
+ # - role: calibration
+ # tags: calibration
+
+- hosts: sut_azure
+ become: yes
+ become_user: root
+ gather_facts: false
+ pre_tasks:
+ - name: Gathering Facts
+ gather_facts:
+ tags:
+ - always
+ roles:
+ - role: user_add
+ tags: user_add
+ - role: common
+ tags: common
+ - role: docker
+ tags: docker
+ - role: vpp
+ tags: vpp
+ - role: iperf
+ tags: iperf
+ - role: dpdk
+ tags: dpdk
+ - role: azure
+ tags: azure
+ - role: cleanup
+ tags: cleanup
+ # TODO: 'Check Kernel Parameters' failing in
+ # resources/tools/testbed-setup/ansible/roles/calibration/tasks/main.yaml
+ # - role: calibration
+ # tags: calibration
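
Editor's note: the `gather_facts: false` plus a "Gathering Facts" pre-task tagged `always` (used in tg.yaml below as well) keeps fact collection alive on tag-limited runs, since `always`-tagged tasks execute regardless of the --tags selection, e.g.:

    # facts are still gathered even though only the vpp role is selected:
    ansible-playbook site.yaml --tags vpp
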
diff --git a/resources/tools/testbed-setup/ansible/sut_aws.yaml b/resources/tools/testbed-setup/ansible/sut_aws.yaml
deleted file mode 100644
index a7642abfef..0000000000
--- a/resources/tools/testbed-setup/ansible/sut_aws.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
----
-# file: sut_aws.yaml
-
-- hosts: sut
- become: yes
- become_user: root
- roles:
- - role: user_add
- tags: user_add
- - role: common
- tags: common
- - role: vpp
- tags: vpp
- - role: dpdk
- tags: dpdk
- - role: aws
- tags: aws
- - role: iperf
- tags: iperf
- - role: docker
- tags: docker
- - role: cleanup
- tags: cleanup
-# - role: calibration
-# tags: calibration
diff --git a/resources/tools/testbed-setup/ansible/sut_azure.yaml b/resources/tools/testbed-setup/ansible/sut_azure.yaml
deleted file mode 100644
index 835ab63d83..0000000000
--- a/resources/tools/testbed-setup/ansible/sut_azure.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
----
-# file: sut_azure.yaml
-
-- hosts: sut
- become: yes
- become_user: root
- roles:
- - role: user_add
- tags: user_add
- - role: common
- tags: common
- - role: vpp
- tags: vpp
- - role: azure
- tags: azure
- - role: iperf
- tags: iperf
- - role: docker
- tags: docker
- - role: dpdk
- tags: dpdk
- - role: cleanup
- tags: cleanup
-# - role: calibration
-# tags: calibration
diff --git a/resources/tools/testbed-setup/ansible/tg.yaml b/resources/tools/testbed-setup/ansible/tg.yaml
index fc18038686..69728a694e 100644
--- a/resources/tools/testbed-setup/ansible/tg.yaml
+++ b/resources/tools/testbed-setup/ansible/tg.yaml
@@ -5,6 +5,12 @@
remote_user: testuser
become: yes
become_user: root
+ gather_facts: false
+ pre_tasks:
+ - name: Gathering Facts
+ gather_facts:
+ tags:
+ - always
roles:
- role: baremetal
tags: baremetal
@@ -14,19 +20,90 @@
tags: kernel
- role: mellanox
tags: mellanox
- - role: tg
- tags: tg
+ - role: docker
+ tags: docker
- role: iperf
tags: iperf
- role: trex
tags: trex
- role: ab
tags: ab
- - role: docker
- tags: docker
+ - role: tg
+ tags: tg
+ - role: csit_sut_image
+ tags: csit_sut_image
- role: performance_tuning
tags: performance_tuning
- role: cleanup
tags: cleanup
- role: calibration
tags: calibration
+
+- hosts: tg_aws
+ remote_user: testuser
+ become: yes
+ become_user: root
+ gather_facts: false
+ pre_tasks:
+ - name: Gathering Facts
+ gather_facts:
+ tags:
+ - always
+ roles:
+ - role: user_add
+ tags: user_add
+ - role: common
+ tags: common
+ - role: dpdk
+ tags: dpdk
+ - role: docker
+ tags: docker
+ - role: tg
+ tags: tg
+ - role: iperf
+ tags: iperf
+ - role: trex
+ tags: trex
+ - role: ab
+ tags: ab
+ - role: aws
+ tags: aws
+ - role: cleanup
+ tags: cleanup
+ # TODO: 'Check Kernel Parameters' failing in
+ # resources/tools/testbed-setup/ansible/roles/calibration/tasks/main.yaml
+ # - role: calibration
+ # tags: calibration
+
+- hosts: tg_azure
+ become: yes
+ become_user: root
+ gather_facts: false
+ pre_tasks:
+ - name: Gathering Facts
+ gather_facts:
+ tags:
+ - always
+ roles:
+ - role: user_add
+ tags: user_add
+ - role: common
+ tags: common
+ - role: docker
+ tags: docker
+ - role: tg
+ tags: tg
+ - role: iperf
+ tags: iperf
+ - role: trex
+ tags: trex
+ - role: ab
+ tags: ab
+ - role: azure
+ tags: azure
+ - role: cleanup
+ tags: cleanup
+ # TODO: 'Check Kernel Parameters' failing in
+ # resources/tools/testbed-setup/ansible/roles/calibration/tasks/main.yaml
+ # - role: calibration
+ # tags: calibration
\ No newline at end of file
diff --git a/resources/tools/testbed-setup/ansible/tg_aws.yaml b/resources/tools/testbed-setup/ansible/tg_aws.yaml
deleted file mode 100644
index db0b574d15..0000000000
--- a/resources/tools/testbed-setup/ansible/tg_aws.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
----
-# file: tg_aws.yaml
-
-- hosts: tg
- become: yes
- become_user: root
- roles:
- - role: user_add
- tags: user_add
- - role: common
- tags: common
- - role: dpdk
- tags: dpdk
- - role: aws
- tags: aws
- - role: tg
- tags: tg
- - role: iperf
- tags: iperf
- - role: trex
- tags: trex
- - role: ab
- tags: ab
- - role: docker
- tags: docker
- - role: cleanup
- tags: cleanup
-# - role: calibration
-# tags: calibration
diff --git a/resources/tools/testbed-setup/ansible/tg_azure.yaml b/resources/tools/testbed-setup/ansible/tg_azure.yaml
deleted file mode 100644
index e6940a7c36..0000000000
--- a/resources/tools/testbed-setup/ansible/tg_azure.yaml
+++ /dev/null
@@ -1,27 +0,0 @@
----
-# file: tg_azure.yaml
-
-- hosts: tg
- become: yes
- become_user: root
- roles:
- - role: user_add
- tags: user_add
- - role: common
- tags: common
- - role: azure
- tags: azure
- - role: tg
- tags: tg
- - role: iperf
- tags: iperf
- - role: trex
- tags: trex
- - role: ab
- tags: ab
- - role: docker
- tags: docker
- - role: cleanup
- tags: cleanup
-# - role: calibration
-# tags: calibration
diff --git a/resources/tools/testbed-setup/ansible/vpp_device.yaml b/resources/tools/testbed-setup/ansible/vpp_device.yaml
index ac42b8cafe..2ffea31f6b 100644
--- a/resources/tools/testbed-setup/ansible/vpp_device.yaml
+++ b/resources/tools/testbed-setup/ansible/vpp_device.yaml
@@ -5,6 +5,12 @@
remote_user: testuser
become: yes
become_user: root
+ gather_facts: false
+ pre_tasks:
+ - name: Gathering Facts
+ gather_facts:
+ tags:
+ - always
roles:
- role: user_add
tags: user_add
@@ -18,13 +24,17 @@
tags: nomad
- role: consul
tags: consul
+ - role: prometheus_exporter
+ tags: prometheus_exporter
+ - role: jenkins_job_health_exporter
+ tags: jenkins_job_health_exporter
+ - role: cadvisor
+ tags: cadvisor
- role: vpp_device
tags: vpp_device
- role: kernel_vm
tags: kernel_vm
- role: csit_sut_image
tags: csit_sut_image
- - role: csit_shim_image
- tags: csit_shim_image
- role: cleanup
tags: cleanup
diff --git a/terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl b/terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl
new file mode 100644
index 0000000000..4560cf07ab
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl
@@ -0,0 +1,333 @@
+job "${job_name}" {
+ # The "region" parameter specifies the region in which to execute the job.
+ # If omitted, this inherits the default region name of "global".
+ # region = "global"
+ #
+ # The "datacenters" parameter specifies the list of datacenters which should
+ # be considered when placing this task. This must be provided.
+ datacenters = "${datacenters}"
+
+ # The "type" parameter controls the type of job, which impacts the scheduler's
+ # decision on placement. This configuration is optional and defaults to
+ # "service". For a full list of job types and their differences, please see
+ # the online documentation.
+ #
+ # For more information, please see the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/jobspec/schedulers
+ #
+ type = "service"
+
+ update {
+ # The "max_parallel" parameter specifies the maximum number of updates to
+ # perform in parallel. In this case, this specifies to update a single task
+ # at a time.
+ max_parallel = 1
+
+ health_check = "checks"
+
+ # The "min_healthy_time" parameter specifies the minimum time the allocation
+ # must be in the healthy state before it is marked as healthy and unblocks
+ # further allocations from being updated.
+ min_healthy_time = "10s"
+
+ # The "healthy_deadline" parameter specifies the deadline in which the
+ # allocation must be marked as healthy after which the allocation is
+ # automatically transitioned to unhealthy. Transitioning to unhealthy will
+ # fail the deployment and potentially roll back the job if "auto_revert" is
+ # set to true.
+ healthy_deadline = "3m"
+
+ # The "progress_deadline" parameter specifies the deadline in which an
+ # allocation must be marked as healthy. The deadline begins when the first
+ # allocation for the deployment is created and is reset whenever an allocation
+ # as part of the deployment transitions to a healthy state. If no allocation
+ # transitions to the healthy state before the progress deadline, the
+ # deployment is marked as failed.
+ progress_deadline = "10m"
+
+%{ if use_canary }
+ # The "canary" parameter specifies that changes to the job that would result
+ # in destructive updates should create the specified number of canaries
+ # without stopping any previous allocations. Once the operator determines the
+ # canaries are healthy, they can be promoted which unblocks a rolling update
+ # of the remaining allocations at a rate of "max_parallel".
+ #
+ # Further, setting "canary" equal to the count of the task group allows
+ # blue/green deployments. When the job is updated, a full set of the new
+ # version is deployed and upon promotion the old version is stopped.
+ canary = 1
+
+ # Specifies if the job should auto-promote to the canary version when all
+ # canaries become healthy during a deployment. Defaults to false which means
+ # canaries must be manually updated with the nomad deployment promote
+ # command.
+ auto_promote = true
+
+ # The "auto_revert" parameter specifies if the job should auto-revert to the
+ # last stable job on deployment failure. A job is marked as stable if all the
+ # allocations as part of its deployment were marked healthy.
+ auto_revert = true
+%{ endif }
+ }
+
+ # The "group" stanza defines a series of tasks that should be co-located on
+ # the same Nomad client. Any task within a group will be placed on the same
+ # client.
+ #
+ # For more information and examples on the "group" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/group
+ #
+ group "prod-group1-${service_name}" {
+ # The "count" parameter specifies the number of the task groups that should
+ # be running under this group. This value must be non-negative and defaults
+ # to 1.
+ count = ${group_count}
+
+ # The constraint allows restricting the set of eligible nodes. Constraints
+ # may filter on attributes or client metadata.
+ #
+ # For more information and examples on the "volume" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/constraint
+ #
+ constraint {
+ attribute = "$${attr.cpu.arch}"
+ operator = "!="
+ value = "arm64"
+ }
+
+ # The "task" stanza creates an individual unit of work, such as a Docker
+ # container, web application, or batch processing.
+ #
+ # For more information and examples on the "task" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/task
+ #
+ task "prod-task1-${service_name}" {
+ # The "driver" parameter specifies the task driver that should be used to
+ # run the task.
+ driver = "exec"
+
+ %{ if use_vault_provider }
+ vault {
+ policies = "${vault_kv_policy_name}"
+ }
+ %{ endif }
+
+ # The "config" stanza specifies the driver configuration, which is passed
+ # directly to the driver to start the task. The details of configurations
+ # are specific to each driver, so please see specific driver
+ # documentation for more information.
+ config {
+ command = "local/alertmanager-${version}.linux-amd64/alertmanager"
+ args = [
+ "--config.file=secrets/alertmanager.yml"
+ ]
+ }
+
+ # The artifact stanza instructs Nomad to fetch and unpack a remote resource,
+ # such as a file, tarball, or binary. Nomad downloads artifacts using the
+ # popular go-getter library, which permits downloading artifacts from a
+ # variety of locations using a URL as the input source.
+ #
+ # For more information and examples on the "artifact" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/artifact
+ #
+ artifact {
+ source = "${url}"
+ }
+
+ # The "template" stanza instructs Nomad to manage a template, such as
+ # a configuration file or script. This template can optionally pull data
+ # from Consul or Vault to populate runtime configuration data.
+ #
+ # For more information and examples on the "template" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/template
+ #
+ template {
+ change_mode = "noop"
+ change_signal = "SIGINT"
+ destination = "secrets/alertmanager.yml"
+ left_delimiter = "{{{"
+ right_delimiter = "}}}"
+ data = <<EOH
+global:
+ # The API URL to use for Slack notifications.
+ slack_api_url: '${slack_api_url}'
+
+# The directory from which notification templates are read.
+templates:
+- '/etc/alertmanager/template/*.tmpl'
+
+#tls_config:
+# # CA certificate to validate the server certificate with.
+# # ca_file: <filepath>
+#
+# # Certificate and key files for client cert authentication to the server.
+# cert_file: <filepath>
+# key_file: <filepath>
+#
+# # ServerName extension to indicate the name of the server.
+# # http://tools.ietf.org/html/rfc4366#section-3.1
+# server_name: <string>
+#
+# # Disable validation of the server certificate.
+# insecure_skip_verify: true
+
+# The root route on which each incoming alert enters.
+route:
+ receiver: '${default_receiver}'
+
+ # The labels by which incoming alerts are grouped together. For example,
+ # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
+ # be batched into a single group.
+ #
+ # To aggregate by all possible labels use '...' as the sole label name.
+ # This effectively disables aggregation entirely, passing through all
+ # alerts as-is. This is unlikely to be what you want, unless you have
+ # a very low alert volume or your upstream notification system performs
+ # its own grouping. Example: group_by: [...]
+ group_by: ['alertname', 'cluster', 'service']
+
+ # When a new group of alerts is created by an incoming alert, wait at
+ # least 'group_wait' to send the initial notification.
+ # This ensures that multiple alerts for the same group that start firing
+ # shortly after one another are batched together in the first
+ # notification.
+ group_wait: 30s
+
+ # When the first notification was sent, wait 'group_interval' to send a batch
+ # of new alerts that started firing for that group.
+ group_interval: 5m
+
+ # If an alert has successfully been sent, wait 'repeat_interval' before
+ # resending it.
+ repeat_interval: 3h
+
+ # All the above attributes are inherited by all child routes and can be
+ # overwritten on each.
+ # The child route trees.
+ routes:
+ # This route performs a regular expression match on alert labels to
+ # catch alerts that are related to a list of services.
+ - match_re:
+ service: .*
+ receiver: ${default_receiver}
+ # The service has a sub-route for critical alerts; any alerts
+ # that do not match (i.e. severity != critical) fall back to the
+ # parent route and are sent to '${default_receiver}'.
+ routes:
+ - match:
+ severity: critical
+ receiver: '${default_receiver}'
+
+# Inhibition rules allow muting a set of alerts when another alert is
+# firing.
+# We use this to mute any warning-level notifications if the same alert is
+# already critical.
+inhibit_rules:
+- source_match:
+ severity: 'critical'
+ target_match:
+ severity: 'warning'
+ # Apply inhibition if the alertname is the same.
+ # CAUTION:
+ # If all label names listed in `equal` are missing
+ # from both the source and target alerts,
+ # the inhibition rule will apply!
+ equal: ['alertname', 'cluster', 'service']
+
+receivers:
+- name: '${default_receiver}'
+ slack_configs:
+ - channel: '#${slack_channel}'
+ send_resolved: true
+ icon_url: https://avatars3.githubusercontent.com/u/3380462
+ title: |-
+ [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
+ {{- if gt (len .CommonLabels) (len .GroupLabels) -}}
+ {{" "}}(
+ {{- with .CommonLabels.Remove .GroupLabels.Names }}
+ {{- range $index, $label := .SortedPairs -}}
+ {{ if $index }}, {{ end }}
+ {{- $label.Name }}="{{ $label.Value -}}"
+ {{- end }}
+ {{- end -}}
+ )
+ {{- end }}
+ text: >-
+ {{ range .Alerts -}}
+ *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}
+
+ *Description:* {{ .Annotations.description }}
+
+ *Details:*
+ {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
+ {{ end }}
+ {{ end }}
+EOH
+ }
+
+ # The service stanza instructs Nomad to register a service with Consul.
+ #
+ # For more information and examples on the "task" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/service
+ #
+ service {
+ name = "${service_name}"
+ port = "${service_name}"
+ tags = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ]
+ check {
+ name = "Alertmanager Check Live"
+ type = "http"
+ path = "/-/healthy"
+ interval = "10s"
+ timeout = "2s"
+ }
+ }
+
+ # The "resources" stanza describes the requirements a task needs to
+ # execute. Resource requirements include memory, network, cpu, and more.
+ # This ensures the task will execute on a machine that contains enough
+ # resource capacity.
+ #
+ # For more information and examples on the "resources" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/resources
+ #
+ resources {
+ cpu = ${cpu}
+ memory = ${mem}
+ # The network stanza specifies the networking requirements for the task
+ # group, including the network mode and port allocations. When scheduling
+ # jobs in Nomad they are provisioned across your fleet of machines along
+ # with other jobs and services. Because you don't know in advance what host
+ # your job will be provisioned on, Nomad will provide your tasks with
+ # network configuration when they start up.
+ #
+ # For more information and examples on the "template" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/network
+ #
+ network {
+ port "${service_name}" {
+ static = ${port}
+ }
+ }
+ }
+ }
+ }
+}
\ No newline at end of file
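
Note: alertmanager.hcl above is a Terraform template, so it mixes two interpolation syntaxes. Single-dollar placeholders such as ${job_name} are substituted when Terraform renders the file, while double-dollar forms are escaped so that a literal Nomad interpolation survives rendering. A minimal sketch of the two cases:

    job "${job_name}" { ... }          # "${job_name}" is filled in when Terraform renders the template
    attribute = "$${attr.cpu.arch}"    # "$$" escapes to "$", leaving ${attr.cpu.arch} for Nomad to evaluate
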
diff --git a/terraform-ci-infra/1n_nmd/alertmanager/main.tf b/terraform-ci-infra/1n_nmd/alertmanager/main.tf
new file mode 100644
index 0000000000..411c78a558
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/alertmanager/main.tf
@@ -0,0 +1,37 @@
+locals {
+ datacenters = join(",", var.nomad_datacenters)
+
+ alertmanager_url = join("",
+ [
+ "https://github.com",
+ "/prometheus/alertmanager/releases/download/",
+ "v${var.alertmanager_version}/",
+ "alertmanager-${var.alertmanager_version}.linux-amd64.tar.gz"
+ ]
+ )
+}
+
+data "template_file" "nomad_job_alertmanager" {
+ template = file("${path.module}/conf/nomad/alertmanager.hcl")
+ vars = {
+ datacenters = local.datacenters
+ url = local.alertmanager_url
+ job_name = var.alertmanager_job_name
+ use_canary = var.alertmanager_use_canary
+ group_count = var.alertmanager_group_count
+ service_name = var.alertmanager_service_name
+ use_vault_provider = var.alertmanager_vault_secret.use_vault_provider
+ version = var.alertmanager_version
+ cpu = var.alertmanager_cpu
+ mem = var.alertmanager_mem
+ port = var.alertmanager_port
+ slack_api_url = var.alertmanager_slack_api_url
+ slack_channel = var.alertmanager_slack_channel
+ default_receiver = var.alertmanager_default_receiver
+ }
+}
+
+resource "nomad_job" "nomad_job_alertmanager" {
+ jobspec = data.template_file.nomad_job_alertmanager.rendered
+ detach = false
+}
\ No newline at end of file
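
Note: the only module input without a default is alertmanager_vault_secret, so every caller has to pass the full object even when Vault is not used. A hypothetical root-module invocation (module path and values illustrative, not taken from this change):

    module "alertmanager" {
      source            = "./alertmanager"     # hypothetical path
      nomad_datacenters = [ "dc1" ]
      alertmanager_vault_secret = {
        use_vault_provider        = false      # skip Vault; remaining fields then go unused
        vault_kv_policy_name      = ""
        vault_kv_path             = ""
        vault_kv_field_access_key = ""
        vault_kv_field_secret_key = ""
      }
    }
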
diff --git a/terraform-ci-infra/1n_nmd/alertmanager/variables.tf b/terraform-ci-infra/1n_nmd/alertmanager/variables.tf
new file mode 100644
index 0000000000..ebd862123d
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/alertmanager/variables.tf
@@ -0,0 +1,84 @@
+# Nomad
+variable "nomad_datacenters" {
+ description = "Nomad data centers"
+ type = list(string)
+ default = [ "dc1" ]
+}
+
+# Alertmanager
+variable "alertmanager_job_name" {
+ description = "Job name"
+ type = string
+ default = "alertmanager"
+}
+
+variable "alertmanager_group_count" {
+ description = "Number of group instances"
+ type = number
+ default = 1
+}
+
+variable "alertmanager_service_name" {
+ description = "Service name"
+ type = string
+ default = "alertmanager"
+}
+
+variable "alertmanager_version" {
+ description = "Version"
+ type = string
+ default = "0.21.0"
+}
+
+variable "alertmanager_use_canary" {
+ description = "Uses canary deployment"
+ type = bool
+ default = false
+}
+
+variable "alertmanager_vault_secret" {
+ description = "Set of properties to be able to fetch secret from vault"
+ type = object({
+ use_vault_provider = bool,
+ vault_kv_policy_name = string,
+ vault_kv_path = string,
+ vault_kv_field_access_key = string,
+ vault_kv_field_secret_key = string
+ })
+}
+
+variable "alertmanager_cpu" {
+ description = "CPU allocation"
+ type = number
+ default = 1000
+}
+
+variable "alertmanager_mem" {
+ description = "RAM allocation"
+ type = number
+ default = 1024
+}
+
+variable "alertmanager_port" {
+ description = "TCP allocation"
+ type = number
+ default = 9093
+}
+
+variable "alertmanager_default_receiver" {
+ description = "Alertmanager default receiver"
+ type = string
+ default = "default-receiver"
+}
+
+variable "alertmanager_slack_api_url" {
+ description = "Alertmanager slack API URL"
+ type = string
+ default = "https://hooks.slack.com/services/XXXXXXXXX/XXXXXXXXXXX/XXXXXXXXXXXXXXXXXXXXXXXX"
+}
+
+variable "alertmanager_slack_channel" {
+ description = "Alertmanager slack channel"
+ type = string
+ default = "slack-channel"
+}
\ No newline at end of file
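
Note: the Slack defaults above are placeholders. A real deployment is expected to override at least the webhook URL and channel, for example (values illustrative):

    alertmanager_slack_api_url = "https://hooks.slack.com/services/..."  # real webhook, kept out of version control
    alertmanager_slack_channel = "infra-alerts"                          # channel name without the leading '#'
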
diff --git a/terraform-ci-infra/1n_nmd/exporter/conf/nomad/exporter.hcl b/terraform-ci-infra/1n_nmd/exporter/conf/nomad/exporter.hcl
new file mode 100644
index 0000000000..4fd0768ae7
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/exporter/conf/nomad/exporter.hcl
@@ -0,0 +1,587 @@
+job "${job_name}" {
+ # The "region" parameter specifies the region in which to execute the job.
+ # If omitted, this inherits the default region name of "global".
+ # region = "global"
+ #
+ # The "datacenters" parameter specifies the list of datacenters which should
+ # be considered when placing this task. This must be provided.
+ datacenters = "${datacenters}"
+
+ # The "type" parameter controls the type of job, which impacts the scheduler's
+ # decision on placement. This configuration is optional and defaults to
+ # "service". For a full list of job types and their differences, please see
+ # the online documentation.
+ #
+ # https://www.nomadproject.io/docs/jobspec/schedulers
+ #
+ type = "system"
+
+ update {
+ # The "max_parallel" parameter specifies the maximum number of updates to
+ # perform in parallel. In this case, this specifies to update a single task
+ # at a time.
+ max_parallel = 1
+
+ health_check = "checks"
+
+ # The "min_healthy_time" parameter specifies the minimum time the allocation
+ # must be in the healthy state before it is marked as healthy and unblocks
+ # further allocations from being updated.
+ min_healthy_time = "10s"
+
+ # The "healthy_deadline" parameter specifies the deadline in which the
+ # allocation must be marked as healthy after which the allocation is
+ # automatically transitioned to unhealthy. Transitioning to unhealthy will
+ # fail the deployment and potentially roll back the job if "auto_revert" is
+ # set to true.
+ healthy_deadline = "3m"
+
+ # The "progress_deadline" parameter specifies the deadline in which an
+ # allocation must be marked as healthy. The deadline begins when the first
+ # allocation for the deployment is created and is reset whenever an allocation
+ # as part of the deployment transitions to a healthy state. If no allocation
+ # transitions to the healthy state before the progress deadline, the
+ # deployment is marked as failed.
+ progress_deadline = "10m"
+
+%{ if use_canary }
+ # The "canary" parameter specifies that changes to the job that would result
+ # in destructive updates should create the specified number of canaries
+ # without stopping any previous allocations. Once the operator determines the
+ # canaries are healthy, they can be promoted which unblocks a rolling update
+ # of the remaining allocations at a rate of "max_parallel".
+ #
+ # Further, setting "canary" equal to the count of the task group allows
+ # blue/green deployments. When the job is updated, a full set of the new
+ # version is deployed and upon promotion the old version is stopped.
+ canary = 1
+
+ # Specifies if the job should auto-promote to the canary version when all
+ # canaries become healthy during a deployment. Defaults to false which means
+ # canaries must be manually updated with the nomad deployment promote
+ # command.
+ auto_promote = true
+
+ # The "auto_revert" parameter specifies if the job should auto-revert to the
+ # last stable job on deployment failure. A job is marked as stable if all the
+ # allocations as part of its deployment were marked healthy.
+ auto_revert = true
+%{ endif }
+ }
+
+ # The "group" stanza defines a series of tasks that should be co-located on
+ # the same Nomad client. Any task within a group will be placed on the same
+ # client.
+ #
+ # https://www.nomadproject.io/docs/job-specification/group
+ #
+ group "prod-group1-exporter-amd64" {
+ # The constraint allows restricting the set of eligible nodes. Constraints
+ # may filter on attributes or client metadata.
+ #
+ # https://www.nomadproject.io/docs/job-specification/constraint
+ #
+ constraint {
+ attribute = "$${attr.cpu.arch}"
+ operator = "!="
+ value = "arm64"
+ }
+
+ # The "task" stanza creates an individual unit of work, such as a Docker
+ # container, web application, or batch processing.
+ #
+ # https://www.nomadproject.io/docs/job-specification/task
+ #
+ task "prod-task1-${node_service_name}-amd64" {
+ # The "driver" parameter specifies the task driver that should be used to
+ # run the task.
+ driver = "raw_exec"
+
+ # The "config" stanza specifies the driver configuration, which is passed
+ # directly to the driver to start the task. The details of configurations
+ # are specific to each driver, so please see specific driver
+ # documentation for more information.
+ config {
+ command = "local/node_exporter-${node_version}.linux-amd64/node_exporter"
+ }
+
+ # The artifact stanza instructs Nomad to fetch and unpack a remote resource,
+ # such as a file, tarball, or binary. Nomad downloads artifacts using the
+ # popular go-getter library, which permits downloading artifacts from a
+ # variety of locations using a URL as the input source.
+ #
+ # https://www.nomadproject.io/docs/job-specification/artifact
+ #
+ artifact {
+ source = "${node_url_amd64}"
+ }
+
+ # The service stanza instructs Nomad to register a service with Consul.
+ #
+ # https://www.nomadproject.io/docs/job-specification/service
+ #
+ service {
+ name = "${node_service_name}"
+ port = "${node_service_name}"
+ check {
+ name = "Node Exporter Check Live"
+ type = "http"
+ path = "/metrics"
+ interval = "10s"
+ timeout = "2s"
+ }
+ }
+
+ # The "resources" stanza describes the requirements a task needs to
+ # execute. Resource requirements include memory, network, cpu, and more.
+ # This ensures the task will execute on a machine that contains enough
+ # resource capacity.
+ #
+ # https://www.nomadproject.io/docs/job-specification/resources
+ #
+ resources {
+ cpu = 500
+ # The network stanza specifies the networking requirements for the task
+ # group, including the network mode and port allocations. When scheduling
+ # jobs in Nomad they are provisioned across your fleet of machines along
+ # with other jobs and services. Because you don't know in advance what host
+ # your job will be provisioned on, Nomad will provide your tasks with
+ # network configuration when they start up.
+ #
+ # https://www.nomadproject.io/docs/job-specification/network
+ #
+ network {
+ port "${node_service_name}" {
+ static = ${node_port}
+ }
+ }
+ }
+ }
+ task "prod-task2-${blackbox_service_name}-amd64" {
+ # The "driver" parameter specifies the task driver that should be used to
+ # run the task.
+ driver = "exec"
+
+ # The "config" stanza specifies the driver configuration, which is passed
+ # directly to the driver to start the task. The details of configurations
+ # are specific to each driver, so please see specific driver
+ # documentation for more information.
+ config {
+ command = "local/blackbox_exporter-${blackbox_version}.linux-amd64/blackbox_exporter"
+ args = [
+ "--config.file=secrets/blackbox.yml"
+ ]
+ }
+
+ # The "template" stanza instructs Nomad to manage a template, such as
+ # a configuration file or script. This template can optionally pull data
+ # from Consul or Vault to populate runtime configuration data.
+ #
+ # https://www.nomadproject.io/docs/job-specification/template
+ #
+ template {
+ change_mode = "noop"
+ change_signal = "SIGINT"
+ destination = "secrets/blackbox.yml"
+ data = <<EOH
+modules:
+ http_2xx:
+ prober: http
+ timeout: 5s
+ http:
+ valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
+ no_follow_redirects: false
+ fail_if_ssl: false
+ fail_if_not_ssl: true
+ tls_config:
+ insecure_skip_verify: false
+ preferred_ip_protocol: "ip4"
+ icmp_v4:
+ prober: icmp
+ timeout: 5s
+ icmp:
+ preferred_ip_protocol: "ip4"
+ dns_udp:
+ prober: dns
+ timeout: 5s
+ dns:
+ query_name: "jenkins.fd.io"
+ query_type: "A"
+ valid_rcodes:
+ - NOERROR
+EOH
+ }
+
+ # The artifact stanza instructs Nomad to fetch and unpack a remote resource,
+ # such as a file, tarball, or binary. Nomad downloads artifacts using the
+ # popular go-getter library, which permits downloading artifacts from a
+ # variety of locations using a URL as the input source.
+ #
+ # https://www.nomadproject.io/docs/job-specification/artifact
+ #
+ artifact {
+ source = "${blackbox_url_amd64}"
+ }
+
+ # The service stanza instructs Nomad to register a service with Consul.
+ #
+ # https://www.nomadproject.io/docs/job-specification/service
+ #
+ service {
+ name = "${blackbox_service_name}"
+ port = "${blackbox_service_name}"
+ tags = [ "${blackbox_service_name}$${NOMAD_ALLOC_INDEX}" ]
+ check {
+ name = "Blackbox Exporter Check Live"
+ type = "http"
+ path = "/metrics"
+ interval = "10s"
+ timeout = "2s"
+ }
+ }
+
+ # The "resources" stanza describes the requirements a task needs to
+ # execute. Resource requirements include memory, network, cpu, and more.
+ # This ensures the task will execute on a machine that contains enough
+ # resource capacity.
+ #
+ # https://www.nomadproject.io/docs/job-specification/resources
+ #
+ resources {
+ cpu = 500
+ # The network stanza specifies the networking requirements for the task
+ # group, including the network mode and port allocations. When scheduling
+ # jobs in Nomad they are provisioned across your fleet of machines along
+ # with other jobs and services. Because you don't know in advance what host
+ # your job will be provisioned on, Nomad will provide your tasks with
+ # network configuration when they start up.
+ #
+ # https://www.nomadproject.io/docs/job-specification/network
+ #
+ network {
+ port "${blackbox_service_name}" {
+ static = ${blackbox_port}
+ }
+ }
+ }
+ }
+
+ task "prod-task3-${cadvisor_service_name}-amd64" {
+ # The "driver" parameter specifies the task driver that should be used to
+ # run the task.
+ driver = "docker"
+
+ # The "config" stanza specifies the driver configuration, which is passed
+ # directly to the driver to start the task. The details of configurations
+ # are specific to each driver, so please see specific driver
+ # documentation for more information.
+ config {
+ image = "${cadvisor_image}"
+ volumes = [
+ "/:/rootfs:ro",
+ "/var/run:/var/run:rw",
+ "/sys:/sys:ro",
+ "/var/lib/docker/:/var/lib/docker:ro",
+ "/cgroup:/cgroup:ro"
+ ]
+ }
+
+ # The service stanza instructs Nomad to register a service with Consul.
+ #
+ # https://www.nomadproject.io/docs/job-specification/service
+ #
+ service {
+ name = "${cadvisor_service_name}"
+ port = "${cadvisor_service_name}"
+ check {
+ name = "cAdvisor Check Live"
+ type = "http"
+ path = "/metrics"
+ interval = "10s"
+ timeout = "2s"
+ }
+ }
+
+ # The "resources" stanza describes the requirements a task needs to
+ # execute. Resource requirements include memory, network, cpu, and more.
+ # This ensures the task will execute on a machine that contains enough
+ # resource capacity.
+ #
+ # https://www.nomadproject.io/docs/job-specification/resources
+ #
+ resources {
+ cpu = 500
+ # The network stanza specifies the networking requirements for the task
+ # group, including the network mode and port allocations. When scheduling
+ # jobs in Nomad they are provisioned across your fleet of machines along
+ # with other jobs and services. Because you don't know in advance what host
+ # your job will be provisioned on, Nomad will provide your tasks with
+ # network configuration when they start up.
+ #
+ # https://www.nomadproject.io/docs/job-specification/network
+ #
+ network {
+ port "${cadvisor_service_name}" {
+ static = ${cadvisor_port}
+ }
+ }
+ }
+ }
+ }
+
+ group "prod-group1-exporter-arm64" {
+ # The constraint allows restricting the set of eligible nodes. Constraints
+ # may filter on attributes or client metadata.
+ #
+ # https://www.nomadproject.io/docs/job-specification/constraint
+ #
+ constraint {
+ attribute = "$${attr.cpu.arch}"
+ operator = "=="
+ value = "arm64"
+ }
+
+ # The "task" stanza creates an individual unit of work, such as a Docker
+ # container, web application, or batch processing.
+ #
+ # https://www.nomadproject.io/docs/job-specification/task
+ #
+ task "prod-task1-${node_service_name}-arm64" {
+ # The "driver" parameter specifies the task driver that should be used to
+ # run the task.
+ driver = "raw_exec"
+
+ # The "config" stanza specifies the driver configuration, which is passed
+ # directly to the driver to start the task. The details of configurations
+ # are specific to each driver, so please see specific driver
+ # documentation for more information.
+ config {
+ command = "local/node_exporter-${node_version}.linux-arm64/node_exporter"
+ }
+
+ # The artifact stanza instructs Nomad to fetch and unpack a remote resource,
+ # such as a file, tarball, or binary. Nomad downloads artifacts using the
+ # popular go-getter library, which permits downloading artifacts from a
+ # variety of locations using a URL as the input source.
+ #
+ # https://www.nomadproject.io/docs/job-specification/artifact
+ #
+ artifact {
+ source = "${node_url_arm64}"
+ }
+
+ # The service stanza instructs Nomad to register a service with Consul.
+ #
+ # https://www.nomadproject.io/docs/job-specification/service
+ #
+ service {
+ name = "${node_service_name}"
+ port = "${node_service_name}"
+ check {
+ name = "Node Exporter Check Live"
+ type = "http"
+ path = "/metrics"
+ interval = "10s"
+ timeout = "2s"
+ }
+ }
+
+ # The "resources" stanza describes the requirements a task needs to
+ # execute. Resource requirements include memory, network, cpu, and more.
+ # This ensures the task will execute on a machine that contains enough
+ # resource capacity.
+ #
+ # https://www.nomadproject.io/docs/job-specification/resources
+ #
+ resources {
+ cpu = 500
+ # The network stanza specifies the networking requirements for the task
+ # group, including the network mode and port allocations. When scheduling
+ # jobs in Nomad they are provisioned across your fleet of machines along
+ # with other jobs and services. Because you don't know in advance what host
+ # your job will be provisioned on, Nomad will provide your tasks with
+ # network configuration when they start up.
+ #
+ # https://www.nomadproject.io/docs/job-specification/network
+ #
+ network {
+ port "${node_service_name}" {
+ static = ${node_port}
+ }
+ }
+ }
+ }
+
+ task "prod-task2-${blackbox_service_name}-arm64" {
+ # The "driver" parameter specifies the task driver that should be used to
+ # run the task.
+ driver = "exec"
+
+ # The "config" stanza specifies the driver configuration, which is passed
+ # directly to the driver to start the task. The details of configurations
+ # are specific to each driver, so please see specific driver
+ # documentation for more information.
+ config {
+ command = "local/blackbox_exporter-${blackbox_version}.linux-arm64/blackbox_exporter"
+ args = [
+ "--config.file=secrets/blackbox.yml"
+ ]
+ }
+
+ # The "template" stanza instructs Nomad to manage a template, such as
+ # a configuration file or script. This template can optionally pull data
+ # from Consul or Vault to populate runtime configuration data.
+ #
+ # https://www.nomadproject.io/docs/job-specification/template
+ #
+ template {
+ change_mode = "noop"
+ change_signal = "SIGINT"
+ destination = "secrets/blackbox.yml"
+ data = <<EOH
+modules:
+ http_2xx:
+ prober: http
+ timeout: 5s
+ http:
+ valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
+ no_follow_redirects: false
+ fail_if_ssl: false
+ fail_if_not_ssl: true
+ tls_config:
+ insecure_skip_verify: false
+ preferred_ip_protocol: "ip4"
+ icmp_v4:
+ prober: icmp
+ timeout: 5s
+ icmp:
+ preferred_ip_protocol: "ip4"
+ dns_udp:
+ prober: dns
+ timeout: 5s
+ dns:
+ query_name: "jenkins.fd.io"
+ query_type: "A"
+ valid_rcodes:
+ - NOERROR
+EOH
+ }
+
+ # The artifact stanza instructs Nomad to fetch and unpack a remote resource,
+ # such as a file, tarball, or binary. Nomad downloads artifacts using the
+ # popular go-getter library, which permits downloading artifacts from a
+ # variety of locations using a URL as the input source.
+ #
+ # https://www.nomadproject.io/docs/job-specification/artifact
+ #
+ artifact {
+ source = "${blackbox_url_arm64}"
+ }
+
+ # The service stanza instructs Nomad to register a service with Consul.
+ #
+ # https://www.nomadproject.io/docs/job-specification/service
+ #
+ service {
+ name = "${blackbox_service_name}"
+ port = "${blackbox_service_name}"
+ tags = [ "${blackbox_service_name}$${NOMAD_ALLOC_INDEX}" ]
+ check {
+ name = "Blackbox Exporter Check Live"
+ type = "http"
+ path = "/metrics"
+ interval = "10s"
+ timeout = "2s"
+ }
+ }
+
+ # The "resources" stanza describes the requirements a task needs to
+ # execute. Resource requirements include memory, network, cpu, and more.
+ # This ensures the task will execute on a machine that contains enough
+ # resource capacity.
+ #
+ # https://www.nomadproject.io/docs/job-specification/resources
+ #
+ resources {
+ cpu = 500
+ # The network stanza specifies the networking requirements for the task
+ # group, including the network mode and port allocations. When scheduling
+ # jobs in Nomad they are provisioned across your fleet of machines along
+ # with other jobs and services. Because you don't know in advance what host
+ # your job will be provisioned on, Nomad will provide your tasks with
+ # network configuration when they start up.
+ #
+ # https://www.nomadproject.io/docs/job-specification/network
+ #
+ network {
+ port "${blackbox_service_name}" {
+ static = ${blackbox_port}
+ }
+ }
+ }
+ }
+
+ task "prod-task3-${cadvisor_service_name}-arm64" {
+ # The "driver" parameter specifies the task driver that should be used to
+ # run the task.
+ driver = "docker"
+
+ # The "config" stanza specifies the driver configuration, which is passed
+ # directly to the driver to start the task. The details of configurations
+ # are specific to each driver, so please see specific driver
+ # documentation for more information.
+ config {
+ # There is currently no official cAdvisor image for arm64 yet, so a community build is used.
+ image = "zcube/cadvisor:latest"
+ volumes = [
+ "/:/rootfs:ro",
+ "/var/run:/var/run:rw",
+ "/sys:/sys:ro",
+ "/var/lib/docker/:/var/lib/docker:ro",
+ "/cgroup:/cgroup:ro"
+ ]
+ }
+
+ # The service stanza instructs Nomad to register a service with Consul.
+ #
+ # https://www.nomadproject.io/docs/job-specification/service
+ #
+ service {
+ name = "${cadvisor_service_name}"
+ port = "${cadvisor_service_name}"
+ check {
+ name = "cAdvisor Check Live"
+ type = "http"
+ path = "/metrics"
+ interval = "10s"
+ timeout = "2s"
+ }
+ }
+
+ # The "resources" stanza describes the requirements a task needs to
+ # execute. Resource requirements include memory, network, cpu, and more.
+ # This ensures the task will execute on a machine that contains enough
+ # resource capacity.
+ #
+ # https://www.nomadproject.io/docs/job-specification/resources
+ #
+ resources {
+ cpu = 500
+ # The network stanza specifies the networking requirements for the task
+ # group, including the network mode and port allocations. When scheduling
+ # jobs in Nomad they are provisioned across your fleet of machines along
+ # with other jobs and services. Because you don't know in advance what host
+ # your job will be provisioned on, Nomad will provide your tasks with
+ # network configuration when they start up.
+ #
+ # https://www.nomadproject.io/docs/job-specification/network
+ #
+ network {
+ port "${cadvisor_service_name}" {
+ static = ${cadvisor_port}
+ }
+ }
+ }
+ }
+ }
+}
\ No newline at end of file
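
Note: the node_exporter tasks above use the raw_exec driver, which runs the binary directly on the host without isolation and is disabled on Nomad clients by default. Scheduling this job therefore assumes every client enables the driver in its configuration, e.g.:

    plugin "raw_exec" {
      config {
        enabled = true    # required before Nomad will place raw_exec tasks on this client
      }
    }
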
diff --git a/terraform-ci-infra/1n_nmd/exporter/main.tf b/terraform-ci-infra/1n_nmd/exporter/main.tf
new file mode 100644
index 0000000000..35eb95b071
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/exporter/main.tf
@@ -0,0 +1,64 @@
+locals {
+ datacenters = join(",", var.nomad_datacenters)
+
+ node_url_amd64 = join("",
+ [
+ "https://github.com",
+ "/prometheus/node_exporter/releases/download/",
+ "v${var.node_version}/",
+ "node_exporter-${var.node_version}.linux-amd64.tar.gz"
+ ]
+ )
+ node_url_arm64 = join("",
+ [
+ "https://github.com",
+ "/prometheus/node_exporter/releases/download/",
+ "v${var.node_version}/",
+ "node_exporter-${var.node_version}.linux-arm64.tar.gz"
+ ]
+ )
+
+ blackbox_url_amd64 = join("",
+ [
+ "https://github.com",
+ "/prometheus/blackbox_exporter/releases/download/",
+ "v${var.blackbox_version}/",
+ "blackbox_exporter-${var.blackbox_version}.linux-amd64.tar.gz"
+ ]
+ )
+ blackbox_url_arm64 = join("",
+ [
+ "https://github.com",
+ "/prometheus/blackbox_exporter/releases/download/",
+ "v${var.blackbox_version}/",
+ "blackbox_exporter-${var.blackbox_version}.linux-arm64.tar.gz"
+ ]
+ )
+}
+
+data "template_file" "nomad_job_exporter" {
+ template = file("${path.module}/conf/nomad/exporter.hcl")
+ vars = {
+ datacenters = local.datacenters
+ job_name = var.exporter_job_name
+ use_canary = var.exporter_use_canary
+ node_url_amd64 = local.node_url_amd64
+ node_url_arm64 = local.node_url_arm64
+ node_version = var.node_version
+ node_service_name = var.node_service_name
+ node_port = var.node_port
+ blackbox_url_amd64 = local.blackbox_url_amd64
+ blackbox_url_arm64 = local.blackbox_url_arm64
+ blackbox_version = var.blackbox_version
+ blackbox_service_name = var.blackbox_service_name
+ blackbox_port = var.blackbox_port
+ cadvisor_image = var.cadvisor_image
+ cadvisor_service_name = var.cadvisor_service_name
+ cadvisor_port = var.cadvisor_port
+ }
+}
+
+resource "nomad_job" "nomad_job_exporter" {
+ jobspec = data.template_file.nomad_job_exporter.rendered
+ detach = false
+}
\ No newline at end of file
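
Note: with the default node_version of "1.0.1", the join() above evaluates to the upstream release URL, for example:

    # local.node_url_amd64 renders as:
    # https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-amd64.tar.gz
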
diff --git a/terraform-ci-infra/1n_nmd/exporter/variables.tf b/terraform-ci-infra/1n_nmd/exporter/variables.tf
new file mode 100644
index 0000000000..bfa8bd37ac
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/exporter/variables.tf
@@ -0,0 +1,76 @@
+# Nomad
+variable "nomad_datacenters" {
+ description = "Nomad data centers"
+ type = list(string)
+ default = [ "dc1" ]
+}
+
+# Exporter
+variable "exporter_job_name" {
+ description = "Exporter job name"
+ type = string
+ default = "exporter"
+}
+
+variable "exporter_use_canary" {
+ description = "Uses canary deployment"
+ type = bool
+ default = false
+}
+
+# Node Exporter
+variable "node_service_name" {
+ description = "Node exporter service name"
+ type = string
+ default = "nodeexporter"
+}
+
+variable "node_version" {
+ description = "Node exporter version"
+ type = string
+ default = "1.0.1"
+}
+
+variable "node_port" {
+ description = "Node exporter TCP allocation"
+ type = number
+ default = 9100
+}
+
+# Blackbox Exporter
+variable "blackbox_service_name" {
+ description = "Blackbox exporter service name"
+ type = string
+ default = "blackboxexporter"
+}
+
+variable "blackbox_version" {
+ description = "Blackbox exporter version"
+ type = string
+ default = "0.18.0"
+}
+
+variable "blackbox_port" {
+ description = "Blackbox exporter TCP allocation"
+ type = number
+ default = 9115
+}
+
+# cAdvisor Exporter
+variable "cadvisor_service_name" {
+ description = "cAdvisor exporter service name"
+ type = string
+ default = "cadvisorexporter"
+}
+
+variable "cadvisor_image" {
+ description = "cAdvisor exporter docker image"
+ type = string
+ default = "gcr.io/cadvisor/cadvisor:v0.38.7"
+}
+
+variable "cadvisor_port" {
+ description = "cAdvisor exporter TCP allocation"
+ type = number
+ default = 8080
+}
\ No newline at end of file
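
Note: exporter.hcl is a system job with static host ports, so the default ports above (9100, 9115, 8080) must be free on every Nomad client. A hypothetical override for a clashing cAdvisor port (names and value illustrative):

    module "exporter" {
      source        = "./exporter"   # hypothetical module path
      cadvisor_port = 9080           # alternative port when 8080 is already taken
    }
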
diff --git a/terraform-ci-infra/1n_nmd/grafana/conf/blackbox_exporter_http.json b/terraform-ci-infra/1n_nmd/grafana/conf/blackbox_exporter_http.json
new file mode 100644
index 0000000000..f9df1b239e
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/grafana/conf/blackbox_exporter_http.json
@@ -0,0 +1,1030 @@
+{
+ "__inputs": [
+ {
+ "name": "DS_PROMETHEUS",
+ "label": "signcl-prometheus",
+ "description": "",
+ "type": "datasource",
+ "pluginId": "prometheus",
+ "pluginName": "Prometheus"
+ }
+ ],
+ "__requires": [
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "5.2.2"
+ },
+ {
+ "type": "panel",
+ "id": "graph",
+ "name": "Graph",
+ "version": "5.0.0"
+ },
+ {
+ "type": "datasource",
+ "id": "prometheus",
+ "name": "Prometheus",
+ "version": "5.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "singlestat",
+ "name": "Singlestat",
+ "version": "5.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "description": "Prometheus Blackbox Exporter Overview",
+ "editable": true,
+ "gnetId": 7587,
+ "graphTooltip": 0,
+ "id": null,
+ "iteration": 1534695504413,
+ "links": [],
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "gridPos": {
+ "h": 8,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 138,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "probe_duration_seconds{instance=~\"$target\"}",
+ "format": "time_series",
+ "interval": "$interval",
+ "intervalFactor": 1,
+ "legendFormat": "{{ instance }}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Global Probe Duration",
+ "tooltip": {
+ "shared": true,
+ "sort": 1,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 8
+ },
+ "id": 15,
+ "panels": [],
+ "repeat": "target",
+ "title": "$target status",
+ "type": "row"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "gridPos": {
+ "h": 6,
+ "w": 10,
+ "x": 4,
+ "y": 9
+ },
+ "id": 25,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "probe_http_duration_seconds{instance=~\"$target\"}",
+ "format": "time_series",
+ "interval": "$interval",
+ "intervalFactor": 1,
+ "legendFormat": "{{ phase }}",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "HTTP Duration",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "gridPos": {
+ "h": 6,
+ "w": 10,
+ "x": 14,
+ "y": 9
+ },
+ "id": 17,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "probe_duration_seconds{instance=~\"$target\"}",
+ "format": "time_series",
+ "interval": "$interval",
+ "intervalFactor": 1,
+ "legendFormat": "seconds",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Probe Duration",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 0,
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 2,
+ "w": 4,
+ "x": 0,
+ "y": 11
+ },
+ "id": 20,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "minSpan": 3,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "repeat": null,
+ "repeatDirection": "h",
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "probe_http_status_code{instance=~\"$target\"}",
+ "format": "time_series",
+ "interval": "$interval",
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": "201, 399",
+ "title": "HTTP Status Code",
+ "transparent": false,
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ },
+ {
+ "op": "=",
+ "text": "YES",
+ "value": "1"
+ },
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "0"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 2,
+ "w": 4,
+ "x": 0,
+ "y": 13
+ },
+ "id": 27,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "probe_http_version{instance=~\"$target\"}",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "HTTP Version",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": true,
+ "colors": [
+ "#d44a3a",
+ "rgba(237, 129, 40, 0.89)",
+ "#299c46"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 2,
+ "w": 4,
+ "x": 0,
+ "y": 15
+ },
+ "id": 18,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "minSpan": 3,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "repeat": null,
+ "repeatDirection": "v",
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "probe_http_ssl{instance=~\"$target\"}",
+ "format": "time_series",
+ "interval": "$interval",
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": "0, 1",
+ "title": "SSL",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ },
+ {
+ "op": "=",
+ "text": "YES",
+ "value": "1"
+ },
+ {
+ "op": "=",
+ "text": "NO",
+ "value": "0"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": true,
+ "colors": [
+ "#d44a3a",
+ "rgba(237, 129, 40, 0.89)",
+ "#299c46"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 2,
+ "format": "dtdurations",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 2,
+ "w": 10,
+ "x": 4,
+ "y": 15
+ },
+ "id": 19,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "minSpan": 3,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "repeat": null,
+ "repeatDirection": "h",
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "probe_ssl_earliest_cert_expiry{instance=~\"$target\"} - time()",
+ "format": "time_series",
+ "interval": "$interval",
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": "0,1209600",
+ "timeFrom": null,
+ "title": "SSL Expiry",
+ "transparent": false,
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ },
+ {
+ "op": "=",
+ "text": "YES",
+ "value": "1"
+ },
+ {
+ "op": "=",
+ "text": "NO",
+ "value": "0"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "format": "s",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 2,
+ "w": 5,
+ "x": 14,
+ "y": 15
+ },
+ "id": 23,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "repeat": null,
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "avg(probe_duration_seconds{instance=~\"$target\"})",
+ "format": "time_series",
+ "interval": "$interval",
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "Average Probe Duration",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "format": "s",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 2,
+ "w": 5,
+ "x": 19,
+ "y": 15
+ },
+ "id": 24,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "repeat": null,
+ "repeatDirection": "h",
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "avg(probe_dns_lookup_time_seconds{instance=~\"$target\"})",
+ "format": "time_series",
+ "interval": "$interval",
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "Average DNS Lookup",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ }
+ ],
+ "refresh": "10s",
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [
+ "blackbox",
+ "prometheus"
+ ],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "selected": false,
+ "text": "default",
+ "value": "default"
+ },
+ "hide": 0,
+ "includeAll": false,
+ "label": "datasource",
+ "multi": false,
+ "name": "DS_PROMETHEUS",
+ "options": [],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "type": "datasource"
+ },
+ {
+ "auto": true,
+ "auto_count": 10,
+ "auto_min": "10s",
+ "current": {
+ "text": "10s",
+ "value": "10s"
+ },
+ "hide": 0,
+ "label": "Interval",
+ "name": "interval",
+ "options": [
+ {
+ "selected": false,
+ "text": "auto",
+ "value": "$__auto_interval_interval"
+ },
+ {
+ "selected": false,
+ "text": "5s",
+ "value": "5s"
+ },
+ {
+ "selected": true,
+ "text": "10s",
+ "value": "10s"
+ },
+ {
+ "selected": false,
+ "text": "30s",
+ "value": "30s"
+ },
+ {
+ "selected": false,
+ "text": "1m",
+ "value": "1m"
+ },
+ {
+ "selected": false,
+ "text": "10m",
+ "value": "10m"
+ },
+ {
+ "selected": false,
+ "text": "30m",
+ "value": "30m"
+ },
+ {
+ "selected": false,
+ "text": "1h",
+ "value": "1h"
+ },
+ {
+ "selected": false,
+ "text": "6h",
+ "value": "6h"
+ },
+ {
+ "selected": false,
+ "text": "12h",
+ "value": "12h"
+ },
+ {
+ "selected": false,
+ "text": "1d",
+ "value": "1d"
+ },
+ {
+ "selected": false,
+ "text": "7d",
+ "value": "7d"
+ },
+ {
+ "selected": false,
+ "text": "14d",
+ "value": "14d"
+ },
+ {
+ "selected": false,
+ "text": "30d",
+ "value": "30d"
+ }
+ ],
+ "query": "5s,10s,30s,1m,10m,30m,1h,6h,12h,1d,7d,14d,30d",
+ "refresh": 2,
+ "type": "interval"
+ },
+ {
+ "allValue": null,
+ "current": {},
+ "datasource": "${DS_PROMETHEUS}",
+ "hide": 0,
+ "includeAll": true,
+ "label": null,
+ "multi": true,
+ "name": "target",
+ "options": [],
+ "query": "label_values(probe_success, instance)",
+ "refresh": 1,
+ "regex": "",
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-4h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "browser",
+ "title": "HTTP Exporter",
+ "version": 1
+}
\ No newline at end of file
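The "SSL Expiry" singlestat above evaluates probe_ssl_earliest_cert_expiry{instance=~"$target"} - time(), so the displayed value is the number of seconds until the earliest certificate expires, and the thresholds string "0,1209600" turns the stat red once a certificate has expired and amber inside a two-week window (1209600 s = 14 days). A minimal sketch of checking the same expression against the standard Prometheus HTTP API; the server address is an assumed placeholder:

# A minimal sketch, not part of the patch, assuming a Prometheus server
# reachable at the placeholder address below.
import json
import urllib.parse
import urllib.request

PROMETHEUS_URL = "http://localhost:9090"  # assumed placeholder address
QUERY = 'probe_ssl_earliest_cert_expiry{instance=~".*"} - time()'

def seconds_until_expiry():
    """Evaluate the same expression the singlestat panel uses."""
    url = PROMETHEUS_URL + "/api/v1/query?" + urllib.parse.urlencode({"query": QUERY})
    with urllib.request.urlopen(url) as resp:
        vector = json.load(resp)["data"]["result"]
    # Each vector element carries the instance label and a [timestamp, value] pair.
    return {r["metric"]["instance"]: float(r["value"][1]) for r in vector}

# The thresholds string "0,1209600" marks 1209600 s, i.e. exactly 14 days.
assert 14 * 24 * 3600 == 1209600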
diff --git a/terraform-ci-infra/1n_nmd/grafana/conf/blackbox_exporter_icmp.json b/terraform-ci-infra/1n_nmd/grafana/conf/blackbox_exporter_icmp.json
new file mode 100644
index 0000000000..df30506348
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/grafana/conf/blackbox_exporter_icmp.json
@@ -0,0 +1,368 @@
+{
+ "__inputs": [
+ {
+ "name": "DS_PROMETHEUS",
+ "label": "localhost",
+ "description": "",
+ "type": "datasource",
+ "pluginId": "prometheus",
+ "pluginName": "Prometheus"
+ }
+ ],
+ "__requires": [
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "6.5.2"
+ },
+ {
+ "type": "panel",
+ "id": "graph",
+ "name": "Graph",
+ "version": ""
+ },
+ {
+ "type": "panel",
+ "id": "heatmap",
+ "name": "Heatmap",
+ "version": ""
+ },
+ {
+ "type": "datasource",
+ "id": "prometheus",
+ "name": "Prometheus",
+ "version": "1.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "gnetId": 12412,
+ "graphTooltip": 0,
+ "id": null,
+ "iteration": 1591284149575,
+ "links": [],
+ "panels": [
+ {
+ "cards": {
+ "cardPadding": null,
+ "cardRound": null
+ },
+ "color": {
+ "cardColor": "#b4ff00",
+ "colorScale": "sqrt",
+ "colorScheme": "interpolateRdYlGn",
+ "exponent": 0.5,
+ "mode": "spectrum"
+ },
+ "dataFormat": "tsbuckets",
+ "datasource": "${DS_PROMETHEUS}",
+ "gridPos": {
+ "h": 8,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "heatmap": {},
+ "hideZeroBuckets": false,
+ "highlightCards": true,
+ "id": 7,
+ "legend": {
+ "show": true
+ },
+ "options": {},
+ "reverseYBuckets": true,
+ "targets": [
+ {
+ "expr": "sum(probe_icmp_duration_seconds{phase=\"rtt\"}) by (instance)",
+ "legendFormat": "{{instance}}",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "ICMP RTT",
+ "tooltip": {
+ "show": true,
+ "showHistogram": true
+ },
+ "type": "heatmap",
+ "xAxis": {
+ "show": true
+ },
+ "xBucketNumber": null,
+ "xBucketSize": null,
+ "yAxis": {
+ "decimals": null,
+ "format": "s",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true,
+ "splitFactor": null
+ },
+ "yBucketBound": "middle",
+ "yBucketNumber": null,
+ "yBucketSize": null
+ },
+ {
+ "cards": {
+ "cardPadding": null,
+ "cardRound": null
+ },
+ "color": {
+ "cardColor": "#b4ff00",
+ "colorScale": "sqrt",
+ "colorScheme": "interpolateRdYlGn",
+ "exponent": 0.5,
+ "mode": "spectrum"
+ },
+ "dataFormat": "tsbuckets",
+ "datasource": "${DS_PROMETHEUS}",
+ "gridPos": {
+ "h": 8,
+ "w": 24,
+ "x": 0,
+ "y": 8
+ },
+ "heatmap": {},
+ "hideZeroBuckets": false,
+ "highlightCards": true,
+ "id": 8,
+ "legend": {
+ "show": true
+ },
+ "options": {},
+ "reverseYBuckets": true,
+ "targets": [
+ {
+ "expr": "1-avg_over_time(probe_success{instance=~\"$instance\"}[$__interval])",
+ "format": "time_series",
+ "hide": false,
+ "legendFormat": "{{instance}}",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "ICMP packet loss",
+ "tooltip": {
+ "show": true,
+ "showHistogram": true
+ },
+ "type": "heatmap",
+ "xAxis": {
+ "show": true
+ },
+ "xBucketNumber": null,
+ "xBucketSize": null,
+ "yAxis": {
+ "decimals": null,
+ "format": "percentunit",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true,
+ "splitFactor": null
+ },
+ "yBucketBound": "middle",
+ "yBucketNumber": null,
+ "yBucketSize": null
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": null,
+ "description": "This uses the blackbox exporter, which does not expose packet loss, for example. It could be improved with https://github.com/SuperQ/smokeping_prober because it also keeps track of lost samples (https://github.com/SuperQ/smokeping_prober/issues/24). Unfortunately, that still won't make graphs as nice as smokeping, because each probe only keeps one sample, instead of doing multiple like smokeping does (https://github.com/SuperQ/smokeping_prober/issues/36).",
+ "fill": 0,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 24,
+ "x": 0,
+ "y": 16
+ },
+ "hiddenSeries": false,
+ "id": 2,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "maxPerRow": 2,
+ "nullPointMode": "connected",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 0.5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": "instance",
+ "repeatDirection": "v",
+ "seriesOverrides": [
+ {
+ "alias": "packet loss",
+ "color": "#C4162A",
+ "lines": false,
+ "pointradius": 1,
+ "points": true,
+ "yaxis": 2
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": true,
+ "targets": [
+ {
+ "expr": "sum(probe_icmp_duration_seconds{phase=\"rtt\",instance=~\"$instance\"}) by (instance) > 0",
+ "instant": false,
+ "legendFormat": "RTT",
+ "refId": "A"
+ },
+ {
+ "expr": "1-avg_over_time(probe_success{instance=~\"$instance\"}[$__interval])",
+ "format": "time_series",
+ "legendFormat": "packet loss",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "ICMP round trip time ($instance)",
+ "tooltip": {
+ "shared": true,
+ "sort": 1,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": null,
+ "format": "dtdurations",
+ "label": "RTT",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "percentunit",
+ "label": "packet loss",
+ "logBase": 1,
+ "max": "1",
+ "min": "0.0001",
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "refresh": false,
+ "schemaVersion": 21,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "selected": false,
+ "text": "default",
+ "value": "default"
+ },
+ "hide": 0,
+ "includeAll": false,
+ "label": "datasource",
+ "multi": false,
+ "name": "DS_PROMETHEUS",
+ "options": [],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "type": "datasource"
+ },
+ {
+ "allValue": null,
+ "current": {},
+ "datasource": "${DS_PROMETHEUS}",
+ "definition": "label_values(probe_success, instance)",
+ "hide": 0,
+ "includeAll": true,
+ "label": null,
+ "multi": true,
+ "name": "instance",
+ "options": [],
+ "query": "label_values(probe_success, instance)",
+ "refresh": 2,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-4h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ]
+ },
+ "timezone": "",
+ "title": "ICMP exporter",
+ "version": 1,
+ "description": "Graph ICMP metrics from the blackbox exporter, Smokeping-style"
+}
\ No newline at end of file
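The "ICMP packet loss" heatmap above derives loss as 1 - avg_over_time(probe_success{instance=~"$instance"}[$__interval]): probe_success is 1 for a successful probe and 0 for a failed one, so the windowed average is the success ratio and its complement is the loss. A minimal sketch of the same calculation over raw samples, with a hypothetical probe window:

# A minimal sketch of the loss calculation behind
# 1 - avg_over_time(probe_success{...}[$__interval]); the sample window
# below is hypothetical.
def packet_loss(samples):
    """samples: probe_success values (0 or 1) inside one interval window."""
    return 1 - sum(samples) / len(samples)

window = [1, 1, 0, 1, 1, 1, 0, 1, 1, 1]  # 10 probes, 2 failures
print(f"packet loss: {packet_loss(window):.0%}")  # -> packet loss: 20%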
diff --git a/terraform-ci-infra/1n_nmd/grafana/conf/consul.json b/terraform-ci-infra/1n_nmd/grafana/conf/consul.json
new file mode 100644
index 0000000000..2e4a36f076
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/grafana/conf/consul.json
@@ -0,0 +1,1438 @@
+{
+ "__inputs": [
+ {
+ "name": "DS_PROMETHEUS",
+ "label": "Prometheus",
+ "description": "",
+ "type": "datasource",
+ "pluginId": "prometheus",
+ "pluginName": "Prometheus"
+ }
+ ],
+ "__requires": [
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "4.3.0-beta1"
+ },
+ {
+ "type": "panel",
+ "id": "graph",
+ "name": "Graph",
+ "version": ""
+ },
+ {
+ "type": "datasource",
+ "id": "prometheus",
+ "name": "Prometheus",
+ "version": "1.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "singlestat",
+ "name": "Singlestat",
+ "version": ""
+ }
+ ],
+ "annotations": {
+ "list": []
+ },
+ "editable": true,
+ "gnetId": 2351,
+ "graphTooltip": 0,
+ "hideControls": false,
+ "id": null,
+ "links": [],
+ "rows": [
+ {
+ "collapse": false,
+ "height": 153,
+ "panels": [
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "id": 1,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "span": 2,
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "consul_raft_leader_lastcontact_count",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{host}}",
+ "refId": "A",
+ "step": 60
+ }
+ ],
+ "thresholds": "",
+ "title": "Consul Leader",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "name"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "format": "none",
+ "gauge": {
+ "maxValue": 3,
+ "minValue": 0,
+ "show": true,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "id": 17,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "span": 2,
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "COUNT (changes(consul_memberlist_gossep_sum[1m]) > 0) BY (labels)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 60
+ }
+ ],
+ "thresholds": "1,2",
+ "title": "# servers in cluster",
+ "type": "singlestat",
+ "valueFontSize": "100%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": null,
+ "format": "percent",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": true,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "id": 18,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "span": 2,
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(irate(node_cpu{mode=\"idle\", host=\"$consul\"}[1m])) * 100 / count_scalar(node_cpu{mode=\"user\", host=\"$consul\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 60
+ }
+ ],
+ "thresholds": "",
+ "title": "CPU Idle",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "avg"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(50, 172, 45, 0.97)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(245, 54, 54, 0.9)"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "format": "none",
+ "gauge": {
+ "maxValue": 4,
+ "minValue": 0,
+ "show": true,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "id": 14,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "span": 2,
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "node_load1{host=\"$consul\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 60
+ }
+ ],
+ "thresholds": "1,2",
+ "title": "Load 1",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "avg"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(50, 172, 45, 0.97)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(245, 54, 54, 0.9)"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "format": "none",
+ "gauge": {
+ "maxValue": 4,
+ "minValue": 0,
+ "show": true,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "id": 15,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "span": 2,
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "node_load5{host=\"$consul\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 60
+ }
+ ],
+ "thresholds": "1,2",
+ "title": "Load 5",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "avg"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(50, 172, 45, 0.97)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(245, 54, 54, 0.9)"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "format": "none",
+ "gauge": {
+ "maxValue": 4,
+ "minValue": 0,
+ "show": true,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "id": 16,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "span": 2,
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "node_load15{host=\"$consul\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 60
+ }
+ ],
+ "thresholds": "1,2",
+ "title": "Load 15",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "avg"
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "Dashboard Row",
+ "titleSize": "h6"
+ },
+ {
+ "collapse": false,
+ "height": 250,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "The amount of TCP messages that are sent/received from the server.",
+ "fill": 1,
+ "id": 3,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(consul_memberlist_tcp{host=\"$consul\"}[1m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{type}}",
+ "refId": "A",
+ "step": 10
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Memberlist TCP Messages",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "The amount of UDP messages that are sent/received from the server.",
+ "fill": 1,
+ "id": 5,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(consul_memberlist_udp{host=\"$consul\"}[1m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{type}}",
+ "refId": "A",
+ "step": 10
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Memberlist UDP Messages",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "none",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "Dashboard Row",
+ "titleSize": "h6"
+ },
+ {
+ "collapse": false,
+ "height": 250,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "This measures the time it takes to replicate log entries to followers. This is a general indicator of the load pressure on the Consul servers, as well as the performance of the communication between the servers.",
+ "fill": 1,
+ "id": 6,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "consul_raft_replication_appendEntries_rpc",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{query}} - {{quantile}}%",
+ "refId": "A",
+ "step": 10
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Log replication from leader to servers",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "ms",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "id": 7,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "consul_raft_replication_heartbeat",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{query}} - {{quantile}}%",
+ "refId": "A",
+ "step": 10
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "consul_raft_replication_heartbeat",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "ms",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "Dashboard Row",
+ "titleSize": "h6"
+ },
+ {
+ "collapse": false,
+ "height": 250,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "This measures the time it takes for the leader to write log entries to disk.",
+ "fill": 1,
+ "id": 8,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "consul_raft_leader_dispatchLog",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{quantile}}%",
+ "refId": "A",
+ "step": 10
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Write logs",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "ms",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "This measures the time it takes to commit a new entry to the Raft log on the leader.",
+ "fill": 1,
+ "id": 4,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "consul_raft_commitTime",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{quantile}}%",
+ "refId": "A",
+ "step": 10
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Commit time Leader",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "ms",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "Dashboard Row",
+ "titleSize": "h6"
+ },
+ {
+ "collapse": false,
+ "height": 250,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "This counts the number of Raft transactions occurring over the interval, which is a general indicator of the write load on the Consul servers.",
+ "fill": 1,
+ "id": 9,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "delta(consul_raft_apply[30s])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Transactions",
+ "refId": "A",
+ "step": 10
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Raft Transactions",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "ops",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "This will only be emitted by the Raft leader and measures the time since the leader was last able to contact the follower nodes when checking its leader lease. It can be used as a measure for how stable the Raft timing is and how close the leader is to timing out its lease.\n\nThe lease timeout is 500 ms times the raft_multiplier configuration, so this telemetry value should not be getting close to that configured value, otherwise the Raft timing is marginal and might need to be tuned, or more powerful servers might be needed. See the Server Performance guide for more details.",
+ "fill": 1,
+ "id": 10,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "consul_raft_leader_lastcontact",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{quantile}}%",
+ "refId": "A",
+ "step": 10
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Leader lastContact",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "ms",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "Dashboard Row",
+ "titleSize": "h6"
+ },
+ {
+ "collapse": false,
+ "height": 250,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "id": 12,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "delta(consul_rpc_query{host=\"$consul\"}[30s])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Requests",
+ "refId": "A",
+ "step": 10
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "RPC Requests",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "Consul uses a network tomography system to compute network coordinates for nodes in the cluster. These coordinates allow the network round trip time to be estimated between any two nodes using a very simple calculation. This allows for many useful applications, such as finding the service node nearest a requesting node, or failing over to services in the next closest datacenter.",
+ "fill": 1,
+ "id": 13,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "consul_serf_coordinate_adjustment_ms{host=\"$consul\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{quantile}}%",
+ "refId": "A",
+ "step": 10
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Serf Coordinates",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "ms",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "Dashboard Row",
+ "titleSize": "h6"
+ }
+ ],
+ "schemaVersion": 14,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "selected": false,
+ "text": "default",
+ "value": "default"
+ },
+ "hide": 0,
+ "includeAll": false,
+ "label": "datasource",
+ "multi": false,
+ "name": "DS_PROMETHEUS",
+ "options": [],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "type": "datasource"
+ },
+ {
+ "allValue": null,
+ "current": {},
+ "datasource": "${DS_PROMETHEUS}",
+ "hide": 0,
+ "includeAll": false,
+ "label": null,
+ "multi": false,
+ "name": "consul",
+ "options": [],
+ "query": "label_values(consul_memberlist_gossep_sum, host)",
+ "refresh": 1,
+ "regex": "",
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-4h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "browser",
+ "title": "Consul",
+ "version": 1
+}
\ No newline at end of file
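The "Leader lastContact" panel description above encodes a concrete budget: the leader lease times out after 500 ms times the raft_multiplier setting, so the plotted consul_raft_leader_lastcontact quantiles should stay well below that product. A short sketch of the arithmetic; the default multiplier of 5 is an assumption taken from Consul's documentation:

# A short sketch of the lease-timeout rule of thumb from the panel
# description; the default raft_multiplier of 5 is an assumption taken
# from Consul's documentation.
LEASE_BASE_MS = 500  # base Raft lease timeout per the description

def lease_budget_ms(raft_multiplier):
    return LEASE_BASE_MS * raft_multiplier

for multiplier in (1, 5):
    print(f"raft_multiplier={multiplier}: lease times out after {lease_budget_ms(multiplier)} ms")
# With raft_multiplier=5 the budget is 2500 ms; consul_raft_leader_lastcontact
# quantiles approaching it suggest underpowered servers or timing that needs tuning.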
diff --git a/terraform-ci-infra/1n_nmd/grafana/conf/docker_cadvisor.json b/terraform-ci-infra/1n_nmd/grafana/conf/docker_cadvisor.json
new file mode 100644
index 0000000000..bbad614bb4
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/grafana/conf/docker_cadvisor.json
@@ -0,0 +1,2040 @@
+{
+ "__inputs": [
+ {
+ "name": "DS_PROMETHEUS",
+ "label": "Prometheus",
+ "description": "",
+ "type": "datasource",
+ "pluginId": "prometheus",
+ "pluginName": "Prometheus"
+ }
+ ],
+ "__requires": [
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "6.2.4"
+ },
+ {
+ "type": "panel",
+ "id": "graph",
+ "name": "Graph",
+ "version": ""
+ },
+ {
+ "type": "datasource",
+ "id": "prometheus",
+ "name": "Prometheus",
+ "version": "1.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "singlestat",
+ "name": "Singlestat",
+ "version": ""
+ },
+ {
+ "type": "panel",
+ "id": "table",
+ "name": "Table",
+ "version": ""
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "description": "A simple overview of the most important Docker host and container metrics. (cAdvisor/Prometheus)",
+ "editable": true,
+ "gnetId": 10657,
+ "graphTooltip": 1,
+ "id": null,
+ "iteration": 1564715574785,
+ "links": [],
+ "panels": [
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 0,
+ "editable": true,
+ "error": false,
+ "format": "s",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 4,
+ "x": 0,
+ "y": 0
+ },
+ "height": "",
+ "id": 24,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "options": {},
+ "postfix": "",
+ "postfixFontSize": "30%",
+ "prefix": "",
+ "prefixFontSize": "20%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "time() - node_boot_time_seconds{instance=~\"$node:.*\"}",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 1800
+ }
+ ],
+ "thresholds": "",
+ "title": "Uptime",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "editable": true,
+ "error": false,
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 4,
+ "x": 4,
+ "y": 0
+ },
+ "id": 31,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "options": {},
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "count(container_last_seen{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "refId": "A",
+ "step": 1800
+ }
+ ],
+ "thresholds": "",
+ "title": "Containers",
+ "type": "singlestat",
+ "valueFontSize": "120%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(50, 172, 45, 0.97)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(245, 54, 54, 0.9)"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 0,
+ "editable": true,
+ "error": false,
+ "format": "decbytes",
+ "gauge": {
+ "maxValue": 500000000,
+ "minValue": 0,
+ "show": true,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 4,
+ "x": 8,
+ "y": 0
+ },
+ "id": 30,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "options": {},
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "(node_memory_SwapTotal_bytes{instance=~'$node:9100'} - node_memory_SwapFree_bytes{instance=~'$node:9100'})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 1800
+ }
+ ],
+ "thresholds": "400000000",
+ "title": "Swap",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 0,
+ "editable": true,
+ "error": false,
+ "format": "percentunit",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 4,
+ "x": 12,
+ "y": 0
+ },
+ "id": 27,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "options": {},
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(50, 189, 31, 0.18)",
+ "full": false,
+ "lineColor": "rgb(69, 193, 31)",
+ "show": true
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "node_load1{instance=~\"$node:9100\"} / count by(job, instance)(count by(job, instance, cpu)(node_cpu_seconds_total{instance=~\"$node:9100\"}))",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "refId": "A",
+ "step": 1800
+ }
+ ],
+ "thresholds": "0.8,0.9",
+ "title": "Load",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "avg"
+ },
+ {
+ "alert": {
+ "conditions": [
+ {
+ "evaluator": {
+ "params": [
+ 10000000000
+ ],
+ "type": "gt"
+ },
+ "query": {
+ "params": [
+ "A",
+ "5m",
+ "now"
+ ]
+ },
+ "reducer": {
+ "params": [],
+ "type": "avg"
+ },
+ "type": "query"
+ }
+ ],
+ "executionErrorState": "alerting",
+ "frequency": "60s",
+ "handler": 1,
+ "name": "Available Memory alert",
+ "noDataState": "keep_state",
+ "notifications": [
+ {
+ "id": 1
+ }
+ ]
+ },
+ "aliasColors": {
+ "Available Memory": "#7EB26D",
+ "Unavailable Memory": "#7EB26D"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "editable": true,
+ "error": false,
+ "fill": 1,
+ "grid": {},
+ "gridPos": {
+ "h": 10,
+ "w": 4,
+ "x": 16,
+ "y": 0
+ },
+ "id": 20,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null as zero",
+ "options": {},
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "container_memory_rss{name=~\".+\"}",
+ "format": "time_series",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "{{__name__}}",
+ "refId": "D",
+ "step": 20
+ },
+ {
+ "expr": "sum(container_memory_rss{name=~\".+\"})",
+ "format": "time_series",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "{{__name__}}",
+ "refId": "A",
+ "step": 20
+ },
+ {
+ "expr": "container_memory_usage_bytes{name=~\".+\"}",
+ "format": "time_series",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "{{name}}",
+ "refId": "B",
+ "step": 20
+ },
+ {
+ "expr": "container_memory_rss{id=\"/\"}",
+ "format": "time_series",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "{{__name__}}",
+ "refId": "C",
+ "step": 20
+ },
+ {
+ "expr": "sum(container_memory_rss)",
+ "format": "time_series",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "{{__name__}}",
+ "refId": "E",
+ "step": 20
+ },
+ {
+ "expr": "node_memory_Buffers",
+ "format": "time_series",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "node_memory_Dirty",
+ "refId": "N",
+ "step": 30
+ },
+ {
+ "expr": "node_memory_MemFree",
+ "format": "time_series",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "{{__name__}}",
+ "refId": "F",
+ "step": 20
+ },
+ {
+ "expr": "node_memory_MemAvailable",
+ "format": "time_series",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "Available Memory",
+ "refId": "H",
+ "step": 20
+ },
+ {
+ "expr": "node_memory_MemTotal_bytes{instance=~\"$node:9100\"} - node_memory_MemAvailable_bytes{instance=~\"$node:9100\"}",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "Unavailable Memory",
+ "refId": "G",
+ "step": 600
+ },
+ {
+ "expr": "node_memory_Inactive",
+ "format": "time_series",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "{{__name__}}",
+ "refId": "I",
+ "step": 30
+ },
+ {
+ "expr": "node_memory_KernelStack",
+ "format": "time_series",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "{{__name__}}",
+ "refId": "J",
+ "step": 30
+ },
+ {
+ "expr": "node_memory_Active",
+ "format": "time_series",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "{{__name__}}",
+ "refId": "K",
+ "step": 30
+ },
+ {
+ "expr": "node_memory_MemTotal - (node_memory_Active + node_memory_MemFree + node_memory_Inactive)",
+ "format": "time_series",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "Unknown",
+ "refId": "L",
+ "step": 40
+ },
+ {
+ "expr": "node_memory_MemFree + node_memory_Inactive ",
+ "format": "time_series",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "{{__name__}}",
+ "refId": "M",
+ "step": 30
+ },
+ {
+ "expr": "container_memory_rss{name=~\".+\"}",
+ "format": "time_series",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "{{__name__}}",
+ "refId": "O",
+ "step": 30
+ },
+ {
+ "expr": "node_memory_Inactive + node_memory_MemFree + node_memory_MemAvailable",
+ "format": "time_series",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "",
+ "refId": "P",
+ "step": 40
+ }
+ ],
+ "thresholds": [
+ {
+ "colorMode": "critical",
+ "fill": true,
+ "line": true,
+ "op": "gt",
+ "value": 10000000000,
+ "yaxis": "left"
+ }
+ ],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Available Memory",
+ "tooltip": {
+ "msResolution": true,
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": false,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "",
+ "logBase": 1,
+ "max": 16000000000,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "alert": {
+ "conditions": [
+ {
+ "evaluator": {
+ "params": [
+ 850000000000
+ ],
+ "type": "gt"
+ },
+ "query": {
+ "params": [
+ "A",
+ "5m",
+ "now"
+ ]
+ },
+ "reducer": {
+ "params": [],
+ "type": "avg"
+ },
+ "type": "query"
+ }
+ ],
+ "executionErrorState": "alerting",
+ "frequency": "60s",
+ "handler": 1,
+ "name": "Free/Used Disk Space alert",
+ "noDataState": "keep_state",
+ "notifications": [
+ {
+ "id": 1
+ }
+ ]
+ },
+ "aliasColors": {
+ "Belegete Festplatte": "#BF1B00",
+ "Free Disk Space": "#7EB26D",
+ "Used Disk Space": "#7EB26D",
+ "{}": "#BF1B00"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "editable": true,
+ "error": false,
+ "fill": 1,
+ "grid": {},
+ "gridPos": {
+ "h": 10,
+ "w": 4,
+ "x": 20,
+ "y": 0
+ },
+ "id": 13,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null as zero",
+ "options": {},
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "Used Disk Space",
+ "yaxis": 1
+ }
+ ],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_filesystem_size_bytes{fstype=\"rootfs\"} - node_filesystem_free_bytes{fstype=\"rootfs\"}",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "Used Disk Space",
+ "refId": "A",
+ "step": 600
+ }
+ ],
+ "thresholds": [
+ {
+ "colorMode": "critical",
+ "fill": true,
+ "line": true,
+ "op": "gt",
+ "value": 850000000000
+ }
+ ],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Used Disk Space",
+ "tooltip": {
+ "msResolution": true,
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": false,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "",
+ "logBase": 1,
+ "max": 1000000000000,
+ "min": 0,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "SENT": "#BF1B00"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "editable": true,
+ "error": false,
+ "fill": 1,
+ "grid": {},
+ "gridPos": {
+ "h": 6,
+ "w": 4,
+ "x": 0,
+ "y": 4
+ },
+ "id": 19,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null as zero",
+ "options": {},
+ "percentage": false,
+ "pointradius": 1,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(container_network_receive_bytes_total{id=\"/\"}[$interval])) by (id)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "RECEIVED",
+ "refId": "A",
+ "step": 600
+ },
+ {
+ "expr": "- sum(rate(container_network_transmit_bytes_total{id=\"/\"}[$interval])) by (id)",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "SENT",
+ "refId": "B",
+ "step": 600
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Network Traffic",
+ "tooltip": {
+ "msResolution": true,
+ "shared": true,
+ "sort": 0,
+ "value_type": "cumulative"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": false,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(50, 172, 45, 0.97)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(245, 54, 54, 0.9)"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 0,
+ "editable": true,
+ "error": false,
+ "format": "percent",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": true,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 4,
+ "x": 4,
+ "y": 4
+ },
+ "id": 25,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "options": {},
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "((node_memory_MemTotal_bytes{instance=~\"$node:9100\"} - node_memory_MemAvailable_bytes{instance=~\"$node:9100\"}) / node_memory_MemTotal_bytes{instance=~\"$node:9100\"}) * 100",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "refId": "A",
+ "step": 1800
+ }
+ ],
+ "thresholds": "70, 90",
+ "title": "Memory",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "aliasColors": {
+ "{id=\"/\",instance=\"cadvisor:8080\",job=\"prometheus\"}": "#BA43A9"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "editable": true,
+ "error": false,
+ "fill": 1,
+ "grid": {},
+ "gridPos": {
+ "h": 6,
+ "w": 4,
+ "x": 8,
+ "y": 4
+ },
+ "id": 5,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null as zero",
+ "options": {},
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(container_cpu_system_seconds_total[1m]))",
+ "format": "time_series",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "a",
+ "refId": "B",
+ "step": 120
+ },
+ {
+ "expr": "sum(rate(container_cpu_system_seconds_total{name=~\".+\"}[1m]))",
+ "format": "time_series",
+ "hide": true,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "nur container",
+ "refId": "F",
+ "step": 10
+ },
+ {
+ "expr": "sum(rate(container_cpu_system_seconds_total{id=\"/\"}[1m]))",
+ "format": "time_series",
+ "hide": true,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "nur docker host",
+ "metric": "",
+ "refId": "A",
+ "step": 20
+ },
+ {
+ "expr": "sum(rate(process_cpu_seconds_total[$interval])) * 100",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "host",
+ "metric": "",
+ "refId": "C",
+ "step": 600
+ },
+ {
+ "expr": "sum(rate(container_cpu_system_seconds_total{name=~\".+\"}[1m])) + sum(rate(container_cpu_system_seconds_total{id=\"/\"}[1m])) + sum(rate(process_cpu_seconds_total[1m]))",
+ "format": "time_series",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "",
+ "refId": "D",
+ "step": 120
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "CPU Usage",
+ "tooltip": {
+ "msResolution": true,
+ "shared": true,
+ "sort": 0,
+ "value_type": "cumulative"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": false,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "percent",
+ "label": "",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "IN on /sda": "#7EB26D",
+ "OUT on /sda": "#890F02"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "editable": true,
+ "error": false,
+ "fill": 1,
+ "grid": {},
+ "gridPos": {
+ "h": 6,
+ "w": 4,
+ "x": 12,
+ "y": 4
+ },
+ "id": 3,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null as zero",
+ "options": {},
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "-sum(rate(node_disk_read_bytes_total[$interval])) by (device)",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "OUT on /{{device}}",
+ "metric": "node_disk_bytes_read",
+ "refId": "A",
+ "step": 600
+ },
+ {
+ "expr": "sum(rate(node_disk_written_bytes_total[$interval])) by (device)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "IN on /{{device}}",
+ "metric": "",
+ "refId": "B",
+ "step": 600
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Disk I/O",
+ "tooltip": {
+ "msResolution": true,
+ "shared": true,
+ "sort": 0,
+ "value_type": "cumulative"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": false,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "editable": true,
+ "error": false,
+ "fill": 1,
+ "grid": {},
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 10
+ },
+ "id": 8,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "null as zero",
+ "options": {},
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(container_network_receive_bytes_total{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}[$interval])) by (name)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{name}}",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "- rate(container_network_transmit_bytes_total{name=~\".+\"}[$interval])",
+ "format": "time_series",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "{{name}}",
+ "refId": "B",
+ "step": 10
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Received Network Traffic per Container",
+ "tooltip": {
+ "msResolution": true,
+ "shared": true,
+ "sort": 0,
+ "value_type": "cumulative"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "editable": true,
+ "error": false,
+ "fill": 1,
+ "grid": {},
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 12,
+ "y": 10
+ },
+ "id": 9,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": false,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "null as zero",
+ "options": {},
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(container_network_transmit_bytes_total{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}[$interval])) by (name)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{name}}",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "rate(container_network_transmit_bytes_total{id=\"/\"}[$interval])",
+ "format": "time_series",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "",
+ "refId": "B",
+ "step": 10
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Sent Network Traffic per Container",
+ "tooltip": {
+ "msResolution": true,
+ "shared": true,
+ "sort": 0,
+ "value_type": "cumulative"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": "",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": "",
+ "logBase": 10,
+ "max": 8,
+ "min": 0,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "editable": true,
+ "error": false,
+ "fill": 5,
+ "grid": {},
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 17
+ },
+ "id": 1,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null as zero",
+ "options": {},
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(container_cpu_usage_seconds_total{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}[$interval])) by (name) * 100",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{name}}",
+ "metric": "",
+ "refId": "F",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "CPU Usage per Container",
+ "tooltip": {
+ "msResolution": true,
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "percent",
+ "label": "",
+ "logBase": 1,
+ "max": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "editable": true,
+ "error": false,
+ "fill": 3,
+ "grid": {},
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 12,
+ "y": 17
+ },
+ "id": 34,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "null as zero",
+ "options": {},
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(container_memory_swap{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}) by (name)",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "{{name}}",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "container_memory_usage_bytes{name=~\".+\"}",
+ "format": "time_series",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "{{name}}",
+ "refId": "B",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory Swap per Container",
+ "tooltip": {
+ "msResolution": true,
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "editable": true,
+ "error": false,
+ "fill": 3,
+ "grid": {},
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 24
+ },
+ "id": 10,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "null as zero",
+ "options": {},
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(container_memory_rss{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}) by (name)",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "{{name}}",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "container_memory_usage_bytes{name=~\".+\"}",
+ "format": "time_series",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "{{name}}",
+ "refId": "B",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory Usage per Container",
+ "tooltip": {
+ "msResolution": true,
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "columns": [
+ {
+ "text": "Current",
+ "value": "current"
+ }
+ ],
+ "editable": true,
+ "error": false,
+ "fontSize": "100%",
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 12,
+ "y": 24
+ },
+ "id": 36,
+ "links": [],
+ "options": {},
+ "pageSize": null,
+ "scroll": true,
+ "showHeader": true,
+ "sort": {
+ "col": 0,
+ "desc": true
+ },
+ "styles": [
+ {
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "decimals": 2,
+ "pattern": "/.*/",
+ "thresholds": [
+ "10000000",
+ " 25000000"
+ ],
+ "type": "number",
+ "unit": "decbytes"
+ }
+ ],
+ "targets": [
+ {
+ "expr": "sum(container_spec_memory_limit_bytes{name=~\".+\"} - container_memory_usage_bytes{name=~\".+\"}) by (name) ",
+ "format": "table",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "{{name}}",
+ "metric": "",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "sum(container_spec_memory_limit_bytes{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}) by (name) ",
+ "format": "table",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "{{name}}",
+ "refId": "B",
+ "step": 240
+ },
+ {
+ "expr": "container_memory_usage_bytes{name=~\".+\"}",
+ "format": "table",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "{{name}}",
+ "refId": "C",
+ "step": 240
+ }
+ ],
+ "title": "Limit memory",
+ "transform": "table",
+ "type": "table"
+ }
+ ],
+ "refresh": "5m",
+ "schemaVersion": 18,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "selected": false,
+ "text": "default",
+ "value": "default"
+ },
+ "hide": 0,
+ "includeAll": false,
+ "label": "datasource",
+ "multi": false,
+ "name": "DS_PROMETHEUS",
+ "options": [],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "type": "datasource"
+ },
+ {
+ "allValue": null,
+ "current": {},
+ "datasource": "${DS_PROMETHEUS}",
+ "definition": "label_values(container_cpu_user_seconds_total, job)",
+ "hide": 0,
+ "includeAll": false,
+ "label": "Job",
+ "multi": false,
+ "name": "job",
+ "options": [],
+ "query": "label_values(container_cpu_user_seconds_total, job)",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": {},
+ "datasource": "${DS_PROMETHEUS}",
+ "definition": "label_values(container_cpu_user_seconds_total{job=~\"$job\"}, instance)",
+ "hide": 0,
+ "includeAll": false,
+ "label": "Host:",
+ "multi": false,
+ "name": "node",
+ "options": [],
+ "query": "label_values(container_cpu_user_seconds_total{job=~\"$job\"}, instance)",
+ "refresh": 1,
+ "regex": "/([^:]+):.*/",
+ "skipUrlSync": false,
+ "sort": 1,
+ "tagValuesQuery": null,
+ "tags": [],
+ "tagsQuery": null,
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": {},
+ "datasource": "${DS_PROMETHEUS}",
+ "definition": "label_values(container_cpu_user_seconds_total{instance=~\"$node:(.*)\"}, instance)",
+ "hide": 0,
+ "includeAll": false,
+ "label": "Port",
+ "multi": false,
+ "name": "port",
+ "options": [],
+ "query": "label_values(container_cpu_user_seconds_total{instance=~\"$node:(.*)\"}, instance)",
+ "refresh": 1,
+ "regex": "/[^:]+:(.*)/",
+ "skipUrlSync": false,
+ "sort": 3,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "auto": true,
+ "auto_count": 30,
+ "auto_min": "50s",
+ "current": {
+ "text": "1m",
+ "value": "1m"
+ },
+ "hide": 0,
+ "label": "Interval",
+ "name": "interval",
+ "options": [
+ {
+ "selected": false,
+ "text": "auto",
+ "value": "$__auto_interval_interval"
+ },
+ {
+ "selected": true,
+ "text": "1m",
+ "value": "1m"
+ },
+ {
+ "selected": false,
+ "text": "5m",
+ "value": "5m"
+ },
+ {
+ "selected": false,
+ "text": "10m",
+ "value": "10m"
+ },
+ {
+ "selected": false,
+ "text": "30m",
+ "value": "30m"
+ },
+ {
+ "selected": false,
+ "text": "1h",
+ "value": "1h"
+ },
+ {
+ "selected": false,
+ "text": "6h",
+ "value": "6h"
+ },
+ {
+ "selected": false,
+ "text": "12h",
+ "value": "12h"
+ },
+ {
+ "selected": false,
+ "text": "1d",
+ "value": "1d"
+ },
+ {
+ "selected": false,
+ "text": "7d",
+ "value": "7d"
+ },
+ {
+ "selected": false,
+ "text": "14d",
+ "value": "14d"
+ },
+ {
+ "selected": false,
+ "text": "30d",
+ "value": "30d"
+ }
+ ],
+ "query": "1m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d",
+ "refresh": 2,
+ "skipUrlSync": false,
+ "type": "interval"
+ }
+ ]
+ },
+ "time": {
+ "from": "now-4h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "browser",
+ "title": "Docker cAdvisor",
+ "version": 1
+}
diff --git a/terraform-ci-infra/1n_nmd/grafana/conf/node_exporter.json b/terraform-ci-infra/1n_nmd/grafana/conf/node_exporter.json
new file mode 100644
index 0000000000..766d5afec3
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/grafana/conf/node_exporter.json
@@ -0,0 +1,13692 @@
+{
+ "__inputs": [
+ {
+ "name": "DS_PROMETHEUS",
+ "label": "Prometheus",
+ "description": "",
+ "type": "datasource",
+ "pluginId": "prometheus",
+ "pluginName": "Prometheus"
+ }
+ ],
+ "__requires": [
+ {
+ "type": "panel",
+ "id": "gauge",
+ "name": "Gauge",
+ "version": ""
+ },
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "6.7.3"
+ },
+ {
+ "type": "panel",
+ "id": "graph",
+ "name": "Graph",
+ "version": ""
+ },
+ {
+ "type": "datasource",
+ "id": "prometheus",
+ "name": "Prometheus",
+ "version": "1.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "singlestat",
+ "name": "Singlestat",
+ "version": ""
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "$$hashKey": "object:1058",
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "gnetId": 1860,
+ "graphTooltip": 0,
+ "id": null,
+ "iteration": 1595837627257,
+ "links": [],
+ "panels": [
+ {
+ "collapsed": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 261,
+ "panels": [],
+ "repeat": null,
+ "title": "Quick CPU / Mem / Disk",
+ "type": "row"
+ },
+ {
+ "cacheTimeout": null,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "Busy state of all CPU cores together",
+ "gridPos": {
+ "h": 4,
+ "w": 3,
+ "x": 0,
+ "y": 1
+ },
+ "id": 20,
+ "links": [],
+ "options": {
+ "fieldOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [
+ {
+ "id": 0,
+ "op": "=",
+ "text": "N/A",
+ "type": 1,
+ "value": "null"
+ }
+ ],
+ "max": 100,
+ "min": 0,
+ "nullValueMode": "null",
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "rgba(50, 172, 45, 0.97)",
+ "value": null
+ },
+ {
+ "color": "rgba(237, 129, 40, 0.89)",
+ "value": 85
+ },
+ {
+ "color": "rgba(245, 54, 54, 0.9)",
+ "value": 95
+ }
+ ]
+ },
+ "unit": "percent"
+ },
+ "overrides": [],
+ "values": false
+ },
+ "orientation": "horizontal",
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true
+ },
+ "pluginVersion": "6.7.3",
+ "targets": [
+ {
+ "expr": "(((count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))) - avg(sum by (mode)(irate(node_cpu_seconds_total{mode='idle',instance=\"$node\",job=\"$job\"}[5m])))) * 100) / count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))",
+ "hide": false,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 900
+ }
+ ],
+ "title": "CPU Busy",
+ "type": "gauge"
+ },
+ {
+ "cacheTimeout": null,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "Busy state of all CPU cores together (5 min average)",
+ "gridPos": {
+ "h": 4,
+ "w": 3,
+ "x": 3,
+ "y": 1
+ },
+ "id": 155,
+ "links": [],
+ "options": {
+ "fieldOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [
+ {
+ "id": 0,
+ "op": "=",
+ "text": "N/A",
+ "type": 1,
+ "value": "null"
+ }
+ ],
+ "max": 100,
+ "min": 0,
+ "nullValueMode": "null",
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "rgba(50, 172, 45, 0.97)",
+ "value": null
+ },
+ {
+ "color": "rgba(237, 129, 40, 0.89)",
+ "value": 85
+ },
+ {
+ "color": "rgba(245, 54, 54, 0.9)",
+ "value": 95
+ }
+ ]
+ },
+ "unit": "percent"
+ },
+ "overrides": [],
+ "values": false
+ },
+ "orientation": "horizontal",
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true
+ },
+ "pluginVersion": "6.7.3",
+ "targets": [
+ {
+ "expr": "avg(node_load5{instance=\"$node\",job=\"$job\"}) / count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)) * 100",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "refId": "A",
+ "step": 900
+ }
+ ],
+ "title": "Sys Load (5m avg)",
+ "type": "gauge"
+ },
+ {
+ "cacheTimeout": null,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "Busy state of all CPU cores together (15 min average)",
+ "gridPos": {
+ "h": 4,
+ "w": 3,
+ "x": 6,
+ "y": 1
+ },
+ "id": 19,
+ "links": [],
+ "options": {
+ "fieldOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [
+ {
+ "id": 0,
+ "op": "=",
+ "text": "N/A",
+ "type": 1,
+ "value": "null"
+ }
+ ],
+ "max": 100,
+ "min": 0,
+ "nullValueMode": "null",
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "rgba(50, 172, 45, 0.97)",
+ "value": null
+ },
+ {
+ "color": "rgba(237, 129, 40, 0.89)",
+ "value": 85
+ },
+ {
+ "color": "rgba(245, 54, 54, 0.9)",
+ "value": 95
+ }
+ ]
+ },
+ "unit": "percent"
+ },
+ "overrides": [],
+ "values": false
+ },
+ "orientation": "horizontal",
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true
+ },
+ "pluginVersion": "6.7.3",
+ "targets": [
+ {
+ "expr": "avg(node_load15{instance=\"$node\",job=\"$job\"}) / count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)) * 100",
+ "hide": false,
+ "intervalFactor": 1,
+ "refId": "A",
+ "step": 900
+ }
+ ],
+ "title": "Sys Load (15m avg)",
+ "type": "gauge"
+ },
+ {
+ "cacheTimeout": null,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "Non available RAM memory",
+ "gridPos": {
+ "h": 4,
+ "w": 3,
+ "x": 9,
+ "y": 1
+ },
+ "hideTimeOverride": false,
+ "id": 16,
+ "links": [],
+ "options": {
+ "fieldOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "decimals": 0,
+ "mappings": [],
+ "max": 100,
+ "min": 0,
+ "nullValueMode": "null",
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "rgba(50, 172, 45, 0.97)",
+ "value": null
+ },
+ {
+ "color": "rgba(237, 129, 40, 0.89)",
+ "value": 80
+ },
+ {
+ "color": "rgba(245, 54, 54, 0.9)",
+ "value": 90
+ }
+ ]
+ },
+ "unit": "percent"
+ },
+ "overrides": [],
+ "values": false
+ },
+ "orientation": "horizontal",
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true
+ },
+ "pluginVersion": "6.7.3",
+ "targets": [
+ {
+ "expr": "((node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}) / (node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} )) * 100",
+ "format": "time_series",
+ "hide": true,
+ "intervalFactor": 1,
+ "refId": "A",
+ "step": 900
+ },
+ {
+ "expr": "100 - ((node_memory_MemAvailable_bytes{instance=\"$node\",job=\"$job\"} * 100) / node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"})",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "refId": "B",
+ "step": 900
+ }
+ ],
+ "title": "RAM Used",
+ "type": "gauge"
+ },
+ {
+ "cacheTimeout": null,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "Used Swap",
+ "gridPos": {
+ "h": 4,
+ "w": 3,
+ "x": 12,
+ "y": 1
+ },
+ "id": 21,
+ "links": [],
+ "options": {
+ "fieldOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [
+ {
+ "id": 0,
+ "op": "=",
+ "text": "N/A",
+ "type": 1,
+ "value": "null"
+ }
+ ],
+ "max": 100,
+ "min": 0,
+ "nullValueMode": "null",
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "rgba(50, 172, 45, 0.97)",
+ "value": null
+ },
+ {
+ "color": "rgba(237, 129, 40, 0.89)",
+ "value": 10
+ },
+ {
+ "color": "rgba(245, 54, 54, 0.9)",
+ "value": 25
+ }
+ ]
+ },
+ "unit": "percent"
+ },
+ "overrides": [],
+ "values": false
+ },
+ "orientation": "horizontal",
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true
+ },
+ "pluginVersion": "6.7.3",
+ "targets": [
+ {
+ "expr": "((node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"}) / (node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} )) * 100",
+ "intervalFactor": 1,
+ "refId": "A",
+ "step": 900
+ }
+ ],
+ "title": "SWAP Used",
+ "type": "gauge"
+ },
+ {
+ "cacheTimeout": null,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "Used Root FS",
+ "gridPos": {
+ "h": 4,
+ "w": 3,
+ "x": 15,
+ "y": 1
+ },
+ "id": 154,
+ "links": [],
+ "options": {
+ "fieldOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [
+ {
+ "id": 0,
+ "op": "=",
+ "text": "N/A",
+ "type": 1,
+ "value": "null"
+ }
+ ],
+ "max": 100,
+ "min": 0,
+ "nullValueMode": "null",
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "rgba(50, 172, 45, 0.97)",
+ "value": null
+ },
+ {
+ "color": "rgba(237, 129, 40, 0.89)",
+ "value": 80
+ },
+ {
+ "color": "rgba(245, 54, 54, 0.9)",
+ "value": 90
+ }
+ ]
+ },
+ "unit": "percent"
+ },
+ "overrides": [],
+ "values": false
+ },
+ "orientation": "horizontal",
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true
+ },
+ "pluginVersion": "6.7.3",
+ "targets": [
+ {
+ "expr": "100 - ((node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"} * 100) / node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"})",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "refId": "A",
+ "step": 900
+ }
+ ],
+ "title": "Root FS Used",
+ "type": "gauge"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "Total number of CPU cores",
+ "format": "short",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 2,
+ "w": 2,
+ "x": 18,
+ "y": 1
+ },
+ "id": 14,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))",
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 900
+ }
+ ],
+ "thresholds": "",
+ "title": "CPU Cores",
+ "type": "singlestat",
+ "valueFontSize": "50%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 1,
+ "description": "System uptime",
+ "format": "s",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 2,
+ "w": 4,
+ "x": 20,
+ "y": 1
+ },
+ "hideTimeOverride": true,
+ "id": 15,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "$$hashKey": "object:1094",
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "$$hashKey": "object:1095",
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "null",
+ "nullText": null,
+ "postfix": "s",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "node_time_seconds{instance=\"$node\",job=\"$job\"} - node_boot_time_seconds{instance=\"$node\",job=\"$job\"}",
+ "intervalFactor": 2,
+ "refId": "A",
+ "step": 1800
+ }
+ ],
+ "thresholds": "",
+ "title": "Uptime",
+ "type": "singlestat",
+ "valueFontSize": "50%",
+ "valueMaps": [
+ {
+ "$$hashKey": "object:1097",
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(50, 172, 45, 0.97)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(245, 54, 54, 0.9)"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 0,
+ "description": "Total RootFS",
+ "format": "bytes",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 2,
+ "w": 2,
+ "x": 18,
+ "y": 3
+ },
+ "id": 23,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"}",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "refId": "A",
+ "step": 900
+ }
+ ],
+ "thresholds": "70,90",
+ "title": "RootFS Total",
+ "type": "singlestat",
+ "valueFontSize": "50%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 0,
+ "description": "Total RAM",
+ "format": "bytes",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 2,
+ "w": 2,
+ "x": 20,
+ "y": 3
+ },
+ "id": 75,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "70%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"}",
+ "intervalFactor": 1,
+ "refId": "A",
+ "step": 900
+ }
+ ],
+ "thresholds": "",
+ "title": "RAM Total",
+ "type": "singlestat",
+ "valueFontSize": "50%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 0,
+ "description": "Total SWAP",
+ "format": "bytes",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 2,
+ "w": 2,
+ "x": 22,
+ "y": 3
+ },
+ "id": 18,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "70%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"}",
+ "intervalFactor": 1,
+ "refId": "A",
+ "step": 900
+ }
+ ],
+ "thresholds": "",
+ "title": "SWAP Total",
+ "type": "singlestat",
+ "valueFontSize": "50%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "collapsed": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 5
+ },
+ "id": 263,
+ "panels": [],
+ "repeat": null,
+ "title": "Basic CPU / Mem / Net / Disk",
+ "type": "row"
+ },
+ {
+ "aliasColors": {
+ "Busy": "#EAB839",
+ "Busy Iowait": "#890F02",
+ "Busy other": "#1F78C1",
+ "Idle": "#052B51",
+ "Idle - Waiting for something to happen": "#052B51",
+ "guest": "#9AC48A",
+ "idle": "#052B51",
+ "iowait": "#EAB839",
+ "irq": "#BF1B00",
+ "nice": "#C15C17",
+ "softirq": "#E24D42",
+ "steal": "#FCE2DE",
+ "system": "#508642",
+ "user": "#5195CE"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 2,
+ "description": "Basic CPU info",
+ "fill": 4,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 6
+ },
+ "hiddenSeries": false,
+ "id": 77,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 250,
+ "sort": null,
+ "sortDesc": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": true,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "Busy Iowait",
+ "color": "#890F02"
+ },
+ {
+ "alias": "Idle",
+ "color": "#7EB26D"
+ },
+ {
+ "alias": "Busy System",
+ "color": "#EAB839"
+ },
+ {
+ "alias": "Busy User",
+ "color": "#0A437C"
+ },
+ {
+ "alias": "Busy Other",
+ "color": "#6D1F62"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum by (instance)(irate(node_cpu_seconds_total{mode=\"system\",instance=\"$node\",job=\"$job\"}[5m])) * 100",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "Busy System",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "sum by (instance)(irate(node_cpu_seconds_total{mode='user',instance=\"$node\",job=\"$job\"}[5m])) * 100",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "Busy User",
+ "refId": "B",
+ "step": 240
+ },
+ {
+ "expr": "sum by (instance)(irate(node_cpu_seconds_total{mode='iowait',instance=\"$node\",job=\"$job\"}[5m])) * 100",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Busy Iowait",
+ "refId": "C",
+ "step": 240
+ },
+ {
+ "expr": "sum by (instance)(irate(node_cpu_seconds_total{mode=~\".*irq\",instance=\"$node\",job=\"$job\"}[5m])) * 100",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Busy IRQs",
+ "refId": "D",
+ "step": 240
+ },
+ {
+ "expr": "sum (irate(node_cpu_seconds_total{mode!='idle',mode!='user',mode!='system',mode!='iowait',mode!='irq',mode!='softirq',instance=\"$node\",job=\"$job\"}[5m])) * 100",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Busy Other",
+ "refId": "E",
+ "step": 240
+ },
+ {
+ "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode='idle',instance=\"$node\",job=\"$job\"}[5m])) * 100",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Idle",
+ "refId": "F",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "CPU Basic",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "",
+ "logBase": 1,
+ "max": "100",
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "Apps": "#629E51",
+ "Buffers": "#614D93",
+ "Cache": "#6D1F62",
+ "Cached": "#511749",
+ "Committed": "#508642",
+ "Free": "#0A437C",
+ "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF",
+ "Inactive": "#584477",
+ "PageTables": "#0A50A1",
+ "Page_Tables": "#0A50A1",
+ "RAM_Free": "#E0F9D7",
+ "SWAP Used": "#BF1B00",
+ "Slab": "#806EB7",
+ "Slab_Cache": "#E0752D",
+ "Swap": "#BF1B00",
+ "Swap Used": "#BF1B00",
+ "Swap_Cache": "#C15C17",
+ "Swap_Free": "#2F575E",
+ "Unused": "#EAB839"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 2,
+ "description": "Basic memory usage",
+ "fill": 4,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 12,
+ "y": 6
+ },
+ "hiddenSeries": false,
+ "id": 78,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 350,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "RAM Total",
+ "color": "#E0F9D7",
+ "fill": 0,
+ "stack": false
+ },
+ {
+ "alias": "RAM Cache + Buffer",
+ "color": "#052B51"
+ },
+ {
+ "alias": "RAM Free",
+ "color": "#7EB26D"
+ },
+ {
+ "alias": "Avaliable",
+ "color": "#DEDAF7",
+ "fill": 0,
+ "stack": false
+ }
+ ],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "RAM Total",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"} - (node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} + node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"})",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "RAM Used",
+ "refId": "B",
+ "step": 240
+ },
+ {
+ "expr": "node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} + node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "RAM Cache + Buffer",
+ "refId": "C",
+ "step": 240
+ },
+ {
+ "expr": "node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "RAM Free",
+ "refId": "D",
+ "step": 240
+ },
+ {
+ "expr": "(node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "SWAP Used",
+ "refId": "E",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory Basic",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "Recv_bytes_eth2": "#7EB26D",
+ "Recv_bytes_lo": "#0A50A1",
+ "Recv_drop_eth2": "#6ED0E0",
+ "Recv_drop_lo": "#E0F9D7",
+ "Recv_errs_eth2": "#BF1B00",
+ "Recv_errs_lo": "#CCA300",
+ "Trans_bytes_eth2": "#7EB26D",
+ "Trans_bytes_lo": "#0A50A1",
+ "Trans_drop_eth2": "#6ED0E0",
+ "Trans_drop_lo": "#E0F9D7",
+ "Trans_errs_eth2": "#BF1B00",
+ "Trans_errs_lo": "#CCA300",
+ "recv_bytes_lo": "#0A50A1",
+ "recv_drop_eth0": "#99440A",
+ "recv_drop_lo": "#967302",
+ "recv_errs_eth0": "#BF1B00",
+ "recv_errs_lo": "#890F02",
+ "trans_bytes_eth0": "#7EB26D",
+ "trans_bytes_lo": "#0A50A1",
+ "trans_drop_eth0": "#99440A",
+ "trans_drop_lo": "#967302",
+ "trans_errs_eth0": "#BF1B00",
+ "trans_errs_lo": "#890F02"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "Basic network info per interface",
+ "fill": 4,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 13
+ },
+ "hiddenSeries": false,
+ "id": 74,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*trans.*/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\"}[5m])*8",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "recv {{device}}",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "irate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\"}[5m])*8",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "trans {{device}} ",
+ "refId": "B",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Network Traffic Basic",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bps",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "pps",
+ "label": "",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 3,
+ "description": "Disk space used of all filesystems mounted",
+ "fill": 4,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 12,
+ "y": 13
+ },
+ "height": "",
+ "hiddenSeries": false,
+ "id": 152,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "100 - ((node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'} * 100) / node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{mountpoint}}",
+ "refId": "A",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Disk Space Used Basic",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "percent",
+ "label": null,
+ "logBase": 1,
+ "max": "100",
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "collapsed": true,
+ "datasource": "${DS_PROMETHEUS}",
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 20
+ },
+ "id": 265,
+ "panels": [
+ {
+ "aliasColors": {
+ "Idle - Waiting for something to happen": "#052B51",
+ "guest": "#9AC48A",
+ "idle": "#052B51",
+ "iowait": "#EAB839",
+ "irq": "#BF1B00",
+ "nice": "#C15C17",
+ "softirq": "#E24D42",
+ "steal": "#FCE2DE",
+ "system": "#508642",
+ "user": "#5195CE"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 2,
+ "description": "",
+ "fill": 4,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 12,
+ "w": 12,
+ "x": 0,
+ "y": 21
+ },
+ "hiddenSeries": false,
+ "id": 3,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 250,
+ "sort": null,
+ "sortDesc": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": true,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode=\"system\",instance=\"$node\",job=\"$job\"}[5m])) * 100",
+ "format": "time_series",
+ "interval": "10s",
+ "intervalFactor": 2,
+ "legendFormat": "System - Processes executing in kernel mode",
+ "refId": "A",
+ "step": 20
+ },
+ {
+ "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode='user',instance=\"$node\",job=\"$job\"}[5m])) * 100",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "User - Normal processes executing in user mode",
+ "refId": "B",
+ "step": 240
+ },
+ {
+ "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode='nice',instance=\"$node\",job=\"$job\"}[5m])) * 100",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Nice - Niced processes executing in user mode",
+ "refId": "C",
+ "step": 240
+ },
+ {
+ "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode='idle',instance=\"$node\",job=\"$job\"}[5m])) * 100",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Idle - Waiting for something to happen",
+ "refId": "D",
+ "step": 240
+ },
+ {
+ "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode='iowait',instance=\"$node\",job=\"$job\"}[5m])) * 100",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Iowait - Waiting for I/O to complete",
+ "refId": "E",
+ "step": 240
+ },
+ {
+ "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode='irq',instance=\"$node\",job=\"$job\"}[5m])) * 100",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Irq - Servicing interrupts",
+ "refId": "F",
+ "step": 240
+ },
+ {
+ "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode='softirq',instance=\"$node\",job=\"$job\"}[5m])) * 100",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Softirq - Servicing softirqs",
+ "refId": "G",
+ "step": 240
+ },
+ {
+ "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode='steal',instance=\"$node\",job=\"$job\"}[5m])) * 100",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Steal - Time spent in other operating systems when running in a virtualized environment",
+ "refId": "H",
+ "step": 240
+ },
+ {
+ "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode='guest',instance=\"$node\",job=\"$job\"}[5m])) * 100",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Guest - Time spent running a virtual CPU for a guest operating system",
+ "refId": "I",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "CPU",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "percentage",
+ "logBase": 1,
+ "max": "100",
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "Apps": "#629E51",
+ "Buffers": "#614D93",
+ "Cache": "#6D1F62",
+ "Cached": "#511749",
+ "Committed": "#508642",
+ "Free": "#0A437C",
+ "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF",
+ "Inactive": "#584477",
+ "PageTables": "#0A50A1",
+ "Page_Tables": "#0A50A1",
+ "RAM_Free": "#E0F9D7",
+ "Slab": "#806EB7",
+ "Slab_Cache": "#E0752D",
+ "Swap": "#BF1B00",
+ "Swap - Swap memory usage": "#BF1B00",
+ "Swap_Cache": "#C15C17",
+ "Swap_Free": "#2F575E",
+ "Unused": "#EAB839",
+ "Unused - Free memory unassigned": "#052B51"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 2,
+ "description": "",
+ "fill": 4,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 12,
+ "w": 12,
+ "x": 12,
+ "y": 21
+ },
+ "hiddenSeries": false,
+ "id": 24,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 350,
+ "sort": null,
+ "sortDesc": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*Hardware Corrupted - *./",
+ "stack": false
+ }
+ ],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Slab_bytes{instance=\"$node\",job=\"$job\"} - node_memory_PageTables_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapCached_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "Apps - Memory used by user-space applications",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "node_memory_PageTables_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "PageTables - Memory used to map between virtual and physical memory addresses",
+ "refId": "B",
+ "step": 240
+ },
+ {
+ "expr": "node_memory_SwapCached_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "SwapCache - Memory that keeps track of pages that have been fetched from swap but not yet been modified",
+ "refId": "C",
+ "step": 240
+ },
+ {
+ "expr": "node_memory_Slab_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "Slab - Memory used by the kernel to cache data structures for its own use (caches like inode, dentry, etc)",
+ "refId": "D",
+ "step": 240
+ },
+ {
+ "expr": "node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "Cache - Parked file data (file content) cache",
+ "refId": "E",
+ "step": 240
+ },
+ {
+ "expr": "node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "Buffers - Block device (e.g. harddisk) cache",
+ "refId": "F",
+ "step": 240
+ },
+ {
+ "expr": "node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "Unused - Free memory unassigned",
+ "refId": "G",
+ "step": 240
+ },
+ {
+ "expr": "(node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"})",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "Swap - Swap space used",
+ "refId": "H",
+ "step": 240
+ },
+ {
+ "expr": "node_memory_HardwareCorrupted_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working",
+ "refId": "I",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory Stack",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "bytes",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "receive_packets_eth0": "#7EB26D",
+ "receive_packets_lo": "#E24D42",
+ "transmit_packets_eth0": "#7EB26D",
+ "transmit_packets_lo": "#E24D42"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 4,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 12,
+ "w": 12,
+ "x": 0,
+ "y": 33
+ },
+ "hiddenSeries": false,
+ "id": 84,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "$$hashKey": "object:5871",
+ "alias": "/.*Trans.*/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\"}[5m])*8",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{device}} - Receive",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "irate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\"}[5m])*8",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{device}} - Transmit",
+ "refId": "B",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Network Traffic",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:5884",
+ "format": "bps",
+ "label": "bits out (-) / in (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:5885",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 3,
+ "description": "",
+ "fill": 4,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 12,
+ "w": 12,
+ "x": 12,
+ "y": 33
+ },
+ "height": "",
+ "hiddenSeries": false,
+ "id": 156,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": false,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'} - node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{mountpoint}}",
+ "refId": "A",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Disk Space Used",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "bytes",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 12,
+ "w": 12,
+ "x": 0,
+ "y": 45
+ },
+ "hiddenSeries": false,
+ "id": 229,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideZero": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*Read.*/",
+ "transform": "negative-Y"
+ },
+ {
+ "alias": "/.*sda_.*/",
+ "color": "#7EB26D"
+ },
+ {
+ "alias": "/.*sdb_.*/",
+ "color": "#EAB839"
+ },
+ {
+ "alias": "/.*sdc_.*/",
+ "color": "#6ED0E0"
+ },
+ {
+ "alias": "/.*sdd_.*/",
+ "color": "#EF843C"
+ },
+ {
+ "alias": "/.*sde_.*/",
+ "color": "#E24D42"
+ },
+ {
+ "alias": "/.*sda1.*/",
+ "color": "#584477"
+ },
+ {
+ "alias": "/.*sda2_.*/",
+ "color": "#BA43A9"
+ },
+ {
+ "alias": "/.*sda3_.*/",
+ "color": "#F4D598"
+ },
+ {
+ "alias": "/.*sdb1.*/",
+ "color": "#0A50A1"
+ },
+ {
+ "alias": "/.*sdb2.*/",
+ "color": "#BF1B00"
+ },
+ {
+ "alias": "/.*sdb3.*/",
+ "color": "#E0752D"
+ },
+ {
+ "alias": "/.*sdc1.*/",
+ "color": "#962D82"
+ },
+ {
+ "alias": "/.*sdc2.*/",
+ "color": "#614D93"
+ },
+ {
+ "alias": "/.*sdc3.*/",
+ "color": "#9AC48A"
+ },
+ {
+ "alias": "/.*sdd1.*/",
+ "color": "#65C5DB"
+ },
+ {
+ "alias": "/.*sdd2.*/",
+ "color": "#F9934E"
+ },
+ {
+ "alias": "/.*sdd3.*/",
+ "color": "#EA6460"
+ },
+ {
+ "alias": "/.*sde1.*/",
+ "color": "#E0F9D7"
+ },
+ {
+ "alias": "/.*sde2.*/",
+ "color": "#FCEACA"
+ },
+ {
+ "alias": "/.*sde3.*/",
+ "color": "#F9E2D2"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[5m])",
+ "intervalFactor": 4,
+ "legendFormat": "{{device}} - Reads completed",
+ "refId": "A",
+ "step": 480
+ },
+ {
+ "expr": "irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[5m])",
+ "intervalFactor": 2,
+ "legendFormat": "{{device}} - Writes completed",
+ "refId": "B",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Disk IOps",
+ "tooltip": {
+ "shared": false,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "iops",
+ "label": "IO read (-) / write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "io time": "#890F02"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 3,
+ "description": "",
+ "fill": 4,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 12,
+ "w": 12,
+ "x": 12,
+ "y": 45
+ },
+ "hiddenSeries": false,
+ "id": 42,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sort": null,
+ "sortDesc": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*read.*/",
+ "transform": "negative-Y"
+ },
+ {
+ "alias": "/.*sda.*/",
+ "color": "#7EB26D"
+ },
+ {
+ "alias": "/.*sdb.*/",
+ "color": "#EAB839"
+ },
+ {
+ "alias": "/.*sdc.*/",
+ "color": "#6ED0E0"
+ },
+ {
+ "alias": "/.*sdd.*/",
+ "color": "#EF843C"
+ },
+ {
+ "alias": "/.*sde.*/",
+ "color": "#E24D42"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_disk_read_bytes_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[5m])",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "{{device}} - Successfully read bytes",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "irate(node_disk_written_bytes_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[5m])",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "{{device}} - Successfully written bytes",
+ "refId": "B",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "I/O Usage Read / Write",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": false,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "bytes read (-) / write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "ms",
+ "label": "",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "io time": "#890F02"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 3,
+ "description": "",
+ "fill": 4,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 12,
+ "w": 12,
+ "x": 0,
+ "y": 57
+ },
+ "hiddenSeries": false,
+ "id": 127,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sort": null,
+ "sortDesc": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_disk_io_time_seconds_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"} [5m])",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "{{device}} - Time spent doing I/Os",
+ "refId": "A",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "I/O Usage Times",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": false,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": "time",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "s",
+ "label": "",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "repeat": null,
+ "title": "CPU / Memory / Net / Disk",
+ "type": "row"
+ },
+ {
+ "collapsed": true,
+ "datasource": "${DS_PROMETHEUS}",
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 21
+ },
+ "id": 266,
+ "panels": [
+ {
+ "aliasColors": {
+ "Apps": "#629E51",
+ "Buffers": "#614D93",
+ "Cache": "#6D1F62",
+ "Cached": "#511749",
+ "Committed": "#508642",
+ "Free": "#0A437C",
+ "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF",
+ "Inactive": "#584477",
+ "PageTables": "#0A50A1",
+ "Page_Tables": "#0A50A1",
+ "RAM_Free": "#E0F9D7",
+ "Slab": "#806EB7",
+ "Slab_Cache": "#E0752D",
+ "Swap": "#BF1B00",
+ "Swap_Cache": "#C15C17",
+ "Swap_Free": "#2F575E",
+ "Unused": "#EAB839"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 2,
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 70
+ },
+ "hiddenSeries": false,
+ "id": 136,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 350,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 2,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_memory_Inactive_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Inactive - Memory which has been less recently used. It is more eligible to be reclaimed for other purposes",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "node_memory_Active_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Active - Memory that has been used more recently and usually not reclaimed unless absolutely necessary",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory Active / Inactive",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "cumulative"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "bytes",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "Apps": "#629E51",
+ "Buffers": "#614D93",
+ "Cache": "#6D1F62",
+ "Cached": "#511749",
+ "Committed": "#508642",
+ "Free": "#0A437C",
+ "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF",
+ "Inactive": "#584477",
+ "PageTables": "#0A50A1",
+ "Page_Tables": "#0A50A1",
+ "RAM_Free": "#E0F9D7",
+ "Slab": "#806EB7",
+ "Slab_Cache": "#E0752D",
+ "Swap": "#BF1B00",
+ "Swap_Cache": "#C15C17",
+ "Swap_Free": "#2F575E",
+ "Unused": "#EAB839"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 2,
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 70
+ },
+ "hiddenSeries": false,
+ "id": 135,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 350,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*Committed_AS - .*/"
+ },
+ {
+ "alias": "/.*CommitLimit - .*/",
+ "color": "#BF1B00",
+ "fill": 0
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_memory_Committed_AS_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Committed_AS - Amount of memory presently allocated on the system",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "node_memory_CommitLimit_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "CommitLimit - Amount of memory currently available to be allocated on the system",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory Committed",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "cumulative"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "bytes",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "Apps": "#629E51",
+ "Buffers": "#614D93",
+ "Cache": "#6D1F62",
+ "Cached": "#511749",
+ "Committed": "#508642",
+ "Free": "#0A437C",
+ "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF",
+ "Inactive": "#584477",
+ "PageTables": "#0A50A1",
+ "Page_Tables": "#0A50A1",
+ "RAM_Free": "#E0F9D7",
+ "Slab": "#806EB7",
+ "Slab_Cache": "#E0752D",
+ "Swap": "#BF1B00",
+ "Swap_Cache": "#C15C17",
+ "Swap_Free": "#2F575E",
+ "Unused": "#EAB839"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 2,
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 80
+ },
+ "hiddenSeries": false,
+ "id": 191,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 350,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_memory_Inactive_file_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "Inactive_file - File-backed memory on inactive LRU list",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "node_memory_Inactive_anon_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "Inactive_anon - Anonymous and swap cache on inactive LRU list, including tmpfs (shmem)",
+ "refId": "B",
+ "step": 4
+ },
+ {
+ "expr": "node_memory_Active_file_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "Active_file - File-backed memory on active LRU list",
+ "refId": "C",
+ "step": 4
+ },
+ {
+ "expr": "node_memory_Active_anon_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "Active_anon - Anonymous and swap cache on active least-recently-used (LRU) list, including tmpfs",
+ "refId": "D",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory Active / Inactive Detail",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "cumulative"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "bytes",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "bytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "Active": "#99440A",
+ "Buffers": "#58140C",
+ "Cache": "#6D1F62",
+ "Cached": "#511749",
+ "Committed": "#508642",
+ "Dirty": "#6ED0E0",
+ "Free": "#B7DBAB",
+ "Inactive": "#EA6460",
+ "Mapped": "#052B51",
+ "PageTables": "#0A50A1",
+ "Page_Tables": "#0A50A1",
+ "Slab_Cache": "#EAB839",
+ "Swap": "#BF1B00",
+ "Swap_Cache": "#C15C17",
+ "Total": "#511749",
+ "Total RAM": "#052B51",
+ "Total RAM + Swap": "#052B51",
+ "Total Swap": "#614D93",
+ "VmallocUsed": "#EA6460"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 2,
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 80
+ },
+ "hiddenSeries": false,
+ "id": 130,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 2,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_memory_Writeback_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Writeback - Memory which is actively being written back to disk",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "node_memory_WritebackTmp_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "WritebackTmp - Memory used by FUSE for temporary writeback buffers",
+ "refId": "B",
+ "step": 4
+ },
+ {
+ "expr": "node_memory_Dirty_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Dirty - Memory which is waiting to get written back to the disk",
+ "refId": "C",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory Writeback and Dirty",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "bytes",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "Apps": "#629E51",
+ "Buffers": "#614D93",
+ "Cache": "#6D1F62",
+ "Cached": "#511749",
+ "Committed": "#508642",
+ "Free": "#0A437C",
+ "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF",
+ "Inactive": "#584477",
+ "PageTables": "#0A50A1",
+ "Page_Tables": "#0A50A1",
+ "RAM_Free": "#E0F9D7",
+ "Slab": "#806EB7",
+ "Slab_Cache": "#E0752D",
+ "Swap": "#BF1B00",
+ "Swap_Cache": "#C15C17",
+ "Swap_Free": "#2F575E",
+ "Unused": "#EAB839"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 2,
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 90
+ },
+ "hiddenSeries": false,
+ "id": 138,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 350,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "$$hashKey": "object:4131",
+ "alias": "ShmemHugePages - Memory used by shared memory (shmem) and tmpfs allocated with huge pages",
+ "fill": 0
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_memory_Mapped_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Mapped - Used memory in mapped page files which have been mmapped, such as libraries",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "node_memory_Shmem_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Shmem - Used shared memory (shared between several processes, thus including RAM disks)",
+ "refId": "B",
+ "step": 4
+ },
+ {
+ "expr": "node_memory_ShmemHugePages_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "ShmemHugePages - Memory used by shared memory (shmem) and tmpfs allocated with huge pages",
+ "refId": "C",
+ "step": 4
+ },
+ {
+ "expr": "node_memory_ShmemPmdMapped_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "ShmemPmdMapped - Amount of shared (shmem/tmpfs) memory backed by huge pages",
+ "refId": "D",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory Shared and Mapped",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "cumulative"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:4106",
+ "format": "bytes",
+ "label": "bytes",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "$$hashKey": "object:4107",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "Active": "#99440A",
+ "Buffers": "#58140C",
+ "Cache": "#6D1F62",
+ "Cached": "#511749",
+ "Committed": "#508642",
+ "Dirty": "#6ED0E0",
+ "Free": "#B7DBAB",
+ "Inactive": "#EA6460",
+ "Mapped": "#052B51",
+ "PageTables": "#0A50A1",
+ "Page_Tables": "#0A50A1",
+ "Slab_Cache": "#EAB839",
+ "Swap": "#BF1B00",
+ "Swap_Cache": "#C15C17",
+ "Total": "#511749",
+ "Total RAM": "#052B51",
+ "Total RAM + Swap": "#052B51",
+ "Total Swap": "#614D93",
+ "VmallocUsed": "#EA6460"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 2,
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 90
+ },
+ "hiddenSeries": false,
+ "id": 131,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 2,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_memory_SUnreclaim_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "SUnreclaim - Part of Slab that cannot be reclaimed under memory pressure",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "SReclaimable - Part of Slab that might be reclaimed, such as caches",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory Slab",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "bytes",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "Active": "#99440A",
+ "Buffers": "#58140C",
+ "Cache": "#6D1F62",
+ "Cached": "#511749",
+ "Committed": "#508642",
+ "Dirty": "#6ED0E0",
+ "Free": "#B7DBAB",
+ "Inactive": "#EA6460",
+ "Mapped": "#052B51",
+ "PageTables": "#0A50A1",
+ "Page_Tables": "#0A50A1",
+ "Slab_Cache": "#EAB839",
+ "Swap": "#BF1B00",
+ "Swap_Cache": "#C15C17",
+ "Total": "#511749",
+ "Total RAM": "#052B51",
+ "Total RAM + Swap": "#052B51",
+ "VmallocUsed": "#EA6460"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 2,
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 100
+ },
+ "hiddenSeries": false,
+ "id": 70,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_memory_VmallocChunk_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "VmallocChunk - Largest contiguous block of vmalloc area which is free",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "node_memory_VmallocTotal_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "VmallocTotal - Total size of vmalloc memory area",
+ "refId": "B",
+ "step": 4
+ },
+ {
+ "expr": "node_memory_VmallocUsed_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "VmallocUsed - Amount of vmalloc area which is used",
+ "refId": "C",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory Vmalloc",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "bytes",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "Apps": "#629E51",
+ "Buffers": "#614D93",
+ "Cache": "#6D1F62",
+ "Cached": "#511749",
+ "Committed": "#508642",
+ "Free": "#0A437C",
+ "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF",
+ "Inactive": "#584477",
+ "PageTables": "#0A50A1",
+ "Page_Tables": "#0A50A1",
+ "RAM_Free": "#E0F9D7",
+ "Slab": "#806EB7",
+ "Slab_Cache": "#E0752D",
+ "Swap": "#BF1B00",
+ "Swap_Cache": "#C15C17",
+ "Swap_Free": "#2F575E",
+ "Unused": "#EAB839"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 2,
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 100
+ },
+ "hiddenSeries": false,
+ "id": 159,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 350,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_memory_Bounce_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Bounce - Memory used for block device bounce buffers",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory Bounce",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "cumulative"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "bytes",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "Active": "#99440A",
+ "Buffers": "#58140C",
+ "Cache": "#6D1F62",
+ "Cached": "#511749",
+ "Committed": "#508642",
+ "Dirty": "#6ED0E0",
+ "Free": "#B7DBAB",
+ "Inactive": "#EA6460",
+ "Mapped": "#052B51",
+ "PageTables": "#0A50A1",
+ "Page_Tables": "#0A50A1",
+ "Slab_Cache": "#EAB839",
+ "Swap": "#BF1B00",
+ "Swap_Cache": "#C15C17",
+ "Total": "#511749",
+ "Total RAM": "#052B51",
+ "Total RAM + Swap": "#052B51",
+ "VmallocUsed": "#EA6460"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 2,
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 110
+ },
+ "hiddenSeries": false,
+ "id": 129,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*Inactive.*/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_memory_AnonHugePages_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "AnonHugePages - Memory in anonymous huge pages",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "node_memory_AnonPages_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "AnonPages - Memory in user pages not backed by files",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory Anonymous",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "bytes",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "Apps": "#629E51",
+ "Buffers": "#614D93",
+ "Cache": "#6D1F62",
+ "Cached": "#511749",
+ "Committed": "#508642",
+ "Free": "#0A437C",
+ "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF",
+ "Inactive": "#584477",
+ "PageTables": "#0A50A1",
+ "Page_Tables": "#0A50A1",
+ "RAM_Free": "#E0F9D7",
+ "Slab": "#806EB7",
+ "Slab_Cache": "#E0752D",
+ "Swap": "#BF1B00",
+ "Swap_Cache": "#C15C17",
+ "Swap_Free": "#2F575E",
+ "Unused": "#EAB839"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 2,
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 110
+ },
+ "hiddenSeries": false,
+ "id": 160,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 350,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 2,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_memory_KernelStack_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "KernelStack - Kernel memory stack. This is not reclaimable",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "node_memory_Percpu_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "PerCPU - Per CPU memory allocated dynamically by loadable modules",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory Kernel / CPU",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "cumulative"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "bytes",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "Active": "#99440A",
+ "Buffers": "#58140C",
+ "Cache": "#6D1F62",
+ "Cached": "#511749",
+ "Committed": "#508642",
+ "Dirty": "#6ED0E0",
+ "Free": "#B7DBAB",
+ "Inactive": "#EA6460",
+ "Mapped": "#052B51",
+ "PageTables": "#0A50A1",
+ "Page_Tables": "#0A50A1",
+ "Slab_Cache": "#EAB839",
+ "Swap": "#BF1B00",
+ "Swap_Cache": "#C15C17",
+ "Total": "#511749",
+ "Total RAM": "#806EB7",
+ "Total RAM + Swap": "#806EB7",
+ "VmallocUsed": "#EA6460"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 2,
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 120
+ },
+ "hiddenSeries": false,
+ "id": 140,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_memory_HugePages_Free{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "HugePages_Free - Huge pages in the pool that are not yet allocated",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "node_memory_HugePages_Rsvd{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "HugePages_Rsvd - Huge pages for which a commitment to allocate from the pool has been made, but no allocation has yet been made",
+ "refId": "B",
+ "step": 4
+ },
+ {
+ "expr": "node_memory_HugePages_Surp{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "HugePages_Surp - Huge pages in the pool above the value in /proc/sys/vm/nr_hugepages",
+ "refId": "C",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory HugePages Counter",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "pages",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": "",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "Active": "#99440A",
+ "Buffers": "#58140C",
+ "Cache": "#6D1F62",
+ "Cached": "#511749",
+ "Committed": "#508642",
+ "Dirty": "#6ED0E0",
+ "Free": "#B7DBAB",
+ "Inactive": "#EA6460",
+ "Mapped": "#052B51",
+ "PageTables": "#0A50A1",
+ "Page_Tables": "#0A50A1",
+ "Slab_Cache": "#EAB839",
+ "Swap": "#BF1B00",
+ "Swap_Cache": "#C15C17",
+ "Total": "#511749",
+ "Total RAM": "#806EB7",
+ "Total RAM + Swap": "#806EB7",
+ "VmallocUsed": "#EA6460"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 2,
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 120
+ },
+ "hiddenSeries": false,
+ "id": 71,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 2,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_memory_HugePages_Total{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "HugePages - Total size of the pool of huge pages",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "node_memory_Hugepagesize_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Hugepagesize - Huge Page size",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory HugePages Size",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "bytes",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": "",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "Active": "#99440A",
+ "Buffers": "#58140C",
+ "Cache": "#6D1F62",
+ "Cached": "#511749",
+ "Committed": "#508642",
+ "Dirty": "#6ED0E0",
+ "Free": "#B7DBAB",
+ "Inactive": "#EA6460",
+ "Mapped": "#052B51",
+ "PageTables": "#0A50A1",
+ "Page_Tables": "#0A50A1",
+ "Slab_Cache": "#EAB839",
+ "Swap": "#BF1B00",
+ "Swap_Cache": "#C15C17",
+ "Total": "#511749",
+ "Total RAM": "#052B51",
+ "Total RAM + Swap": "#052B51",
+ "VmallocUsed": "#EA6460"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 2,
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 130
+ },
+ "hiddenSeries": false,
+ "id": 128,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": false,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_memory_DirectMap1G_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "DirectMap1G - Amount of memory mapped as 1G pages",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "node_memory_DirectMap2M_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "DirectMap2M - Amount of memory mapped as 2M pages",
+ "refId": "B",
+ "step": 4
+ },
+ {
+ "expr": "node_memory_DirectMap4k_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "DirectMap4K - Amount of memory mapped as 4K pages",
+ "refId": "C",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory DirectMap",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "bytes",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "Apps": "#629E51",
+ "Buffers": "#614D93",
+ "Cache": "#6D1F62",
+ "Cached": "#511749",
+ "Committed": "#508642",
+ "Free": "#0A437C",
+ "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF",
+ "Inactive": "#584477",
+ "PageTables": "#0A50A1",
+ "Page_Tables": "#0A50A1",
+ "RAM_Free": "#E0F9D7",
+ "Slab": "#806EB7",
+ "Slab_Cache": "#E0752D",
+ "Swap": "#BF1B00",
+ "Swap_Cache": "#C15C17",
+ "Swap_Free": "#2F575E",
+ "Unused": "#EAB839"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 2,
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 130
+ },
+ "hiddenSeries": false,
+ "id": 137,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 350,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_memory_Unevictable_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Unevictable - Amount of unevictable memory that can't be swapped out for a variety of reasons",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "node_memory_Mlocked_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "MLocked - Size of pages locked to memory using the mlock() system call",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory Unevictable and MLocked",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "cumulative"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "bytes",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "Active": "#99440A",
+ "Buffers": "#58140C",
+ "Cache": "#6D1F62",
+ "Cached": "#511749",
+ "Committed": "#508642",
+ "Dirty": "#6ED0E0",
+ "Free": "#B7DBAB",
+ "Inactive": "#EA6460",
+ "Mapped": "#052B51",
+ "PageTables": "#0A50A1",
+ "Page_Tables": "#0A50A1",
+ "Slab_Cache": "#EAB839",
+ "Swap": "#BF1B00",
+ "Swap_Cache": "#C15C17",
+ "Total": "#511749",
+ "Total RAM": "#052B51",
+ "Total RAM + Swap": "#052B51",
+ "Total Swap": "#614D93",
+ "VmallocUsed": "#EA6460"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 2,
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 140
+ },
+ "hiddenSeries": false,
+ "id": 132,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_memory_NFS_Unstable_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "NFS Unstable - Memory in NFS pages sent to the server, but not yet committed to the storage",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory NFS",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "bytes",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "repeat": null,
+ "title": "Memory Meminfo",
+ "type": "row"
+ },
+ {
+ "collapsed": true,
+ "datasource": "${DS_PROMETHEUS}",
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 22
+ },
+ "id": 267,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 23
+ },
+ "hiddenSeries": false,
+ "id": 176,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*out/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_vmstat_pgpgin{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Pagesin - Page in operations",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "irate(node_vmstat_pgpgout{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Pagesout - Page out operations",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory Pages In / Out",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "pages out (-) / in (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 23
+ },
+ "hiddenSeries": false,
+ "id": 22,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*out/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_vmstat_pswpin{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Pswpin - Pages swapped in",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "irate(node_vmstat_pswpout{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Pswpout - Pages swapped out",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory Pages Swap In / Out",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "pages out (-) / in (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "Apps": "#629E51",
+ "Buffers": "#614D93",
+ "Cache": "#6D1F62",
+ "Cached": "#511749",
+ "Committed": "#508642",
+ "Free": "#0A437C",
+ "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF",
+ "Inactive": "#584477",
+ "PageTables": "#0A50A1",
+ "Page_Tables": "#0A50A1",
+ "RAM_Free": "#E0F9D7",
+ "Slab": "#806EB7",
+ "Slab_Cache": "#E0752D",
+ "Swap": "#BF1B00",
+ "Swap_Cache": "#C15C17",
+ "Swap_Free": "#2F575E",
+ "Unused": "#EAB839"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 2,
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 33
+ },
+ "hiddenSeries": false,
+ "id": 175,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 350,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "$$hashKey": "object:6118",
+ "alias": "Pgfault - Page major and minor fault operations",
+ "fill": 0,
+ "stack": false
+ }
+ ],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_vmstat_pgfault{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Pgfault - Page major and minor fault operations",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "irate(node_vmstat_pgmajfault{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Pgmajfault - Major page fault operations",
+ "refId": "B",
+ "step": 4
+ },
+ {
+ "expr": "irate(node_vmstat_pgfault{instance=\"$node\",job=\"$job\"}[5m]) - irate(node_vmstat_pgmajfault{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Pgminfault - Minor page fault operations",
+ "refId": "C",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory Page Faults",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "cumulative"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:6133",
+ "format": "short",
+ "label": "faults",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "$$hashKey": "object:6134",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "Active": "#99440A",
+ "Buffers": "#58140C",
+ "Cache": "#6D1F62",
+ "Cached": "#511749",
+ "Committed": "#508642",
+ "Dirty": "#6ED0E0",
+ "Free": "#B7DBAB",
+ "Inactive": "#EA6460",
+ "Mapped": "#052B51",
+ "PageTables": "#0A50A1",
+ "Page_Tables": "#0A50A1",
+ "Slab_Cache": "#EAB839",
+ "Swap": "#BF1B00",
+ "Swap_Cache": "#C15C17",
+ "Total": "#511749",
+ "Total RAM": "#052B51",
+ "Total RAM + Swap": "#052B51",
+ "Total Swap": "#614D93",
+ "VmallocUsed": "#EA6460"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 2,
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 33
+ },
+ "hiddenSeries": false,
+ "id": 307,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_vmstat_oom_kill{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "OOM killer invocations",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "OOM Killer",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:5373",
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "$$hashKey": "object:5374",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "repeat": null,
+ "title": "Memory Vmstat",
+ "type": "row"
+ },
+ {
+ "collapsed": true,
+ "datasource": "${DS_PROMETHEUS}",
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 23
+ },
+ "id": 293,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 24
+ },
+ "hiddenSeries": false,
+ "id": 260,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*Variation*./",
+ "color": "#890F02"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_timex_estimated_error_seconds{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Estimated error in seconds",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "node_timex_offset_seconds{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Time offset in between local system and reference clock",
+ "refId": "B",
+ "step": 240
+ },
+ {
+ "expr": "node_timex_maxerror_seconds{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Maximum error in seconds",
+ "refId": "C",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Time Syncronized Drift",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": "seconds",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 24
+ },
+ "hiddenSeries": false,
+ "id": 291,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_timex_loop_time_constant{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Phase-locked loop time adjust",
+ "refId": "A",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Time PLL Adjust",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 34
+ },
+ "hiddenSeries": false,
+ "id": 168,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*Variation*./",
+ "color": "#890F02"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_timex_sync_status{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Is clock synchronized to a reliable server (1 = yes, 0 = no)",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "node_timex_frequency_adjustment_ratio{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Local clock frequency adjustment",
+ "refId": "B",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Time Syncronized Status",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 34
+ },
+ "hiddenSeries": false,
+ "id": 294,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_timex_tick_seconds{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Seconds between clock ticks",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "node_timex_tai_offset_seconds{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "International Atomic Time (TAI) offset",
+ "refId": "B",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Time Misc",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": "seconds",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "title": "System Timesync",
+ "type": "row"
+ },
+ {
+ "collapsed": true,
+ "datasource": "${DS_PROMETHEUS}",
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 24
+ },
+ "id": 312,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 7
+ },
+ "hiddenSeries": false,
+ "id": 62,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_procs_blocked{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Processes blocked waiting for I/O to complete",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "node_procs_running{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Processes in runnable state",
+ "refId": "B",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Processes Status",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:6500",
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "$$hashKey": "object:6501",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 7
+ },
+ "hiddenSeries": false,
+ "id": 315,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_processes_state{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{ state }}",
+ "refId": "A",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Processes State",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:6500",
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "$$hashKey": "object:6501",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 17
+ },
+ "hiddenSeries": false,
+ "id": 148,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_forks_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "Processes forks second",
+ "refId": "A",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Processes Forks",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:6640",
+ "format": "short",
+ "label": "forks / sec",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "$$hashKey": "object:6641",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 17
+ },
+ "hiddenSeries": false,
+ "id": 149,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*Max.*/",
+ "fill": 0
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(process_virtual_memory_bytes{instance=\"$node\",job=\"$job\"}[5m])",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Processes virtual memory size in bytes",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "process_resident_memory_max_bytes{instance=\"$node\",job=\"$job\"}",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Maximum amount of virtual memory available in bytes",
+ "refId": "B",
+ "step": 240
+ },
+ {
+ "expr": "irate(process_virtual_memory_max_bytes{instance=\"$node\",job=\"$job\"}[5m])",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Maximum amount of virtual memory available in bytes",
+ "refId": "D",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Processes Memory",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "decbytes",
+ "label": "bytes",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 27
+ },
+ "hiddenSeries": false,
+ "id": 313,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "$$hashKey": "object:709",
+ "alias": "PIDs limit",
+ "color": "#F2495C",
+ "fill": 0
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_processes_pids{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Number of PIDs",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "node_processes_max_processes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "PIDs limit",
+ "refId": "B",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "PIDs Number and Limit",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:6500",
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "$$hashKey": "object:6501",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 27
+ },
+ "hiddenSeries": false,
+ "id": 305,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "$$hashKey": "object:4963",
+ "alias": "/.*waiting.*/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_schedstat_running_seconds_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "CPU {{ cpu }} - seconds spent running a process",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "irate(node_schedstat_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "CPU {{ cpu }} - seconds spent by processing waiting for this CPU",
+ "refId": "B",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Process schedule stats Running / Waiting",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:4860",
+ "format": "s",
+ "label": "seconds",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:4861",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 37
+ },
+ "hiddenSeries": false,
+ "id": 314,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "$$hashKey": "object:709",
+ "alias": "Threads limit",
+ "color": "#F2495C",
+ "fill": 0
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_processes_threads{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Allocated threads",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "node_processes_max_threads{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Threads limit",
+ "refId": "B",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Threads Number and Limit",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:6500",
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "$$hashKey": "object:6501",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "title": "System Processes",
+ "type": "row"
+ },
+ {
+ "collapsed": true,
+ "datasource": "${DS_PROMETHEUS}",
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 25
+ },
+ "id": 269,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 8
+ },
+ "hiddenSeries": false,
+ "id": 8,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_context_switches_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Context switches",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "irate(node_intr_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "Interrupts",
+ "refId": "B",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Context Switches / Interrupts",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 8
+ },
+ "hiddenSeries": false,
+ "id": 7,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_load1{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 4,
+ "legendFormat": "Load 1m",
+ "refId": "A",
+ "step": 480
+ },
+ {
+ "expr": "node_load5{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 4,
+ "legendFormat": "Load 5m",
+ "refId": "B",
+ "step": 480
+ },
+ {
+ "expr": "node_load15{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 4,
+ "legendFormat": "Load 15m",
+ "refId": "C",
+ "step": 480
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "System Load",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:6261",
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "$$hashKey": "object:6262",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 18
+ },
+ "hiddenSeries": false,
+ "id": 259,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*Critical*./",
+ "color": "#E24D42",
+ "fill": 0
+ },
+ {
+ "alias": "/.*Max*./",
+ "color": "#EF843C",
+ "fill": 0
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_interrupts_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{ type }} - {{ info }}",
+ "refId": "A",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Interrupts Detail",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 18
+ },
+ "hiddenSeries": false,
+ "id": 306,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_schedstat_timeslices_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "CPU {{ cpu }}",
+ "refId": "A",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Schedule timeslices executed by each cpu",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:4860",
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:4861",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 28
+ },
+ "hiddenSeries": false,
+ "id": 151,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_entropy_available_bits{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Entropy available to random number generators",
+ "refId": "A",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Entropy",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:6568",
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "$$hashKey": "object:6569",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 28
+ },
+ "hiddenSeries": false,
+ "id": 308,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(process_cpu_seconds_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Time spent",
+ "refId": "A",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "CPU time spent in user and system contexts",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:4860",
+ "format": "s",
+ "label": "seconds",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:4861",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 38
+ },
+ "hiddenSeries": false,
+ "id": 64,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "$$hashKey": "object:6323",
+ "alias": "/.*Max*./",
+ "color": "#890F02",
+ "fill": 0
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "process_max_fds{instance=\"$node\",job=\"$job\"}",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Maximum open file descriptors",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "process_open_fds{instance=\"$node\",job=\"$job\"}",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Open file descriptors",
+ "refId": "B",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "File Descriptors",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:6338",
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "$$hashKey": "object:6339",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "repeat": null,
+ "title": "System Misc",
+ "type": "row"
+ },
+ {
+ "collapsed": true,
+ "datasource": "${DS_PROMETHEUS}",
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 26
+ },
+ "id": 304,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 26
+ },
+ "hiddenSeries": false,
+ "id": 158,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "$$hashKey": "object:6726",
+ "alias": "/.*Critical*./",
+ "color": "#E24D42",
+ "fill": 0
+ },
+ {
+ "$$hashKey": "object:6727",
+ "alias": "/.*Max*./",
+ "color": "#EF843C",
+ "fill": 0
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_hwmon_temp_celsius{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{ chip }} {{ sensor }} temp",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "node_hwmon_temp_crit_alarm_celsius{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": true,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{ chip }} {{ sensor }} Critical Alarm",
+ "refId": "B",
+ "step": 240
+ },
+ {
+ "expr": "node_hwmon_temp_crit_celsius{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{ chip }} {{ sensor }} Critical",
+ "refId": "C",
+ "step": 240
+ },
+ {
+ "expr": "node_hwmon_temp_crit_hyst_celsius{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": true,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{ chip }} {{ sensor }} Critical Historical",
+ "refId": "D",
+ "step": 240
+ },
+ {
+ "expr": "node_hwmon_temp_max_celsius{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": true,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{ chip }} {{ sensor }} Max",
+ "refId": "E",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Hardware temperature monitor",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:6750",
+ "format": "celsius",
+ "label": "temperature",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "$$hashKey": "object:6751",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 26
+ },
+ "hiddenSeries": false,
+ "id": 300,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "$$hashKey": "object:1655",
+ "alias": "/.*Max*./",
+ "color": "#EF843C",
+ "fill": 0
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_cooling_device_cur_state{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Current {{ name }} in {{ type }}",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "node_cooling_device_max_state{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Max {{ name }} in {{ type }}",
+ "refId": "B",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Throttle cooling device",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:1678",
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:1679",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 36
+ },
+ "hiddenSeries": false,
+ "id": 302,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_power_supply_online{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{ power_supply }} online",
+ "refId": "A",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Power supply",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:1678",
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:1679",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "title": "Hardware Misc",
+ "type": "row"
+ },
+ {
+ "collapsed": true,
+ "datasource": "${DS_PROMETHEUS}",
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 27
+ },
+ "id": 296,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 10
+ },
+ "hiddenSeries": false,
+ "id": 297,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_systemd_socket_accepted_connections_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{ name }} Connections",
+ "refId": "A",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Systemd Sockets",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 10
+ },
+ "hiddenSeries": false,
+ "id": 298,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "Failed",
+ "color": "#F2495C"
+ },
+ {
+ "alias": "Inactive",
+ "color": "#FF9830"
+ },
+ {
+ "alias": "Active",
+ "color": "#73BF69"
+ },
+ {
+ "alias": "Deactivating",
+ "color": "#FFCB7D"
+ },
+ {
+ "alias": "Activating",
+ "color": "#C8F2C2"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"activating\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Activating",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"active\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Active",
+ "refId": "B",
+ "step": 240
+ },
+ {
+ "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"deactivating\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Deactivating",
+ "refId": "C",
+ "step": 240
+ },
+ {
+ "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"failed\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Failed",
+ "refId": "D",
+ "step": 240
+ },
+ {
+ "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"inactive\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Inactive",
+ "refId": "E",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Systemd Units State",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "title": "Systemd",
+ "type": "row"
+ },
+ {
+ "collapsed": true,
+ "datasource": "${DS_PROMETHEUS}",
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 28
+ },
+ "id": 270,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 29
+ },
+ "hiddenSeries": false,
+ "id": 9,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideZero": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
+ {
+ "$$hashKey": "object:2033",
+ "alias": "/.*Read.*/",
+ "transform": "negative-Y"
+ },
+ {
+ "$$hashKey": "object:2034",
+ "alias": "/.*sda_.*/",
+ "color": "#7EB26D"
+ },
+ {
+ "$$hashKey": "object:2035",
+ "alias": "/.*sdb_.*/",
+ "color": "#EAB839"
+ },
+ {
+ "$$hashKey": "object:2036",
+ "alias": "/.*sdc_.*/",
+ "color": "#6ED0E0"
+ },
+ {
+ "$$hashKey": "object:2037",
+ "alias": "/.*sdd_.*/",
+ "color": "#EF843C"
+ },
+ {
+ "$$hashKey": "object:2038",
+ "alias": "/.*sde_.*/",
+ "color": "#E24D42"
+ },
+ {
+ "$$hashKey": "object:2039",
+ "alias": "/.*sda1.*/",
+ "color": "#584477"
+ },
+ {
+ "$$hashKey": "object:2040",
+ "alias": "/.*sda2_.*/",
+ "color": "#BA43A9"
+ },
+ {
+ "$$hashKey": "object:2041",
+ "alias": "/.*sda3_.*/",
+ "color": "#F4D598"
+ },
+ {
+ "$$hashKey": "object:2042",
+ "alias": "/.*sdb1.*/",
+ "color": "#0A50A1"
+ },
+ {
+ "$$hashKey": "object:2043",
+ "alias": "/.*sdb2.*/",
+ "color": "#BF1B00"
+ },
+ {
+ "$$hashKey": "object:2044",
+ "alias": "/.*sdb3.*/",
+ "color": "#E0752D"
+ },
+ {
+ "$$hashKey": "object:2045",
+ "alias": "/.*sdc1.*/",
+ "color": "#962D82"
+ },
+ {
+ "$$hashKey": "object:2046",
+ "alias": "/.*sdc2.*/",
+ "color": "#614D93"
+ },
+ {
+ "$$hashKey": "object:2047",
+ "alias": "/.*sdc3.*/",
+ "color": "#9AC48A"
+ },
+ {
+ "$$hashKey": "object:2048",
+ "alias": "/.*sdd1.*/",
+ "color": "#65C5DB"
+ },
+ {
+ "$$hashKey": "object:2049",
+ "alias": "/.*sdd2.*/",
+ "color": "#F9934E"
+ },
+ {
+ "$$hashKey": "object:2050",
+ "alias": "/.*sdd3.*/",
+ "color": "#EA6460"
+ },
+ {
+ "$$hashKey": "object:2051",
+ "alias": "/.*sde1.*/",
+ "color": "#E0F9D7"
+ },
+ {
+ "$$hashKey": "object:2052",
+ "alias": "/.*sdd2.*/",
+ "color": "#FCEACA"
+ },
+ {
+ "$$hashKey": "object:2053",
+ "alias": "/.*sde3.*/",
+ "color": "#F9E2D2"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "intervalFactor": 4,
+ "legendFormat": "{{device}} - Reads completed",
+ "refId": "A",
+ "step": 8
+ },
+ {
+ "expr": "irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "intervalFactor": 2,
+ "legendFormat": "{{device}} - Writes completed",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Disk IOps Completed",
+ "tooltip": {
+ "shared": false,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:2186",
+ "format": "iops",
+ "label": "IO read (-) / write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:2187",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 29
+ },
+ "hiddenSeries": false,
+ "id": 33,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideZero": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*Read.*/",
+ "transform": "negative-Y"
+ },
+ {
+ "alias": "/.*sda_.*/",
+ "color": "#7EB26D"
+ },
+ {
+ "alias": "/.*sdb_.*/",
+ "color": "#EAB839"
+ },
+ {
+ "alias": "/.*sdc_.*/",
+ "color": "#6ED0E0"
+ },
+ {
+ "alias": "/.*sdd_.*/",
+ "color": "#EF843C"
+ },
+ {
+ "alias": "/.*sde_.*/",
+ "color": "#E24D42"
+ },
+ {
+ "alias": "/.*sda1.*/",
+ "color": "#584477"
+ },
+ {
+ "alias": "/.*sda2_.*/",
+ "color": "#BA43A9"
+ },
+ {
+ "alias": "/.*sda3_.*/",
+ "color": "#F4D598"
+ },
+ {
+ "alias": "/.*sdb1.*/",
+ "color": "#0A50A1"
+ },
+ {
+ "alias": "/.*sdb2.*/",
+ "color": "#BF1B00"
+ },
+ {
+ "alias": "/.*sdb3.*/",
+ "color": "#E0752D"
+ },
+ {
+ "alias": "/.*sdc1.*/",
+ "color": "#962D82"
+ },
+ {
+ "alias": "/.*sdc2.*/",
+ "color": "#614D93"
+ },
+ {
+ "alias": "/.*sdc3.*/",
+ "color": "#9AC48A"
+ },
+ {
+ "alias": "/.*sdd1.*/",
+ "color": "#65C5DB"
+ },
+ {
+ "alias": "/.*sdd2.*/",
+ "color": "#F9934E"
+ },
+ {
+ "alias": "/.*sdd3.*/",
+ "color": "#EA6460"
+ },
+ {
+ "alias": "/.*sde1.*/",
+ "color": "#E0F9D7"
+ },
+ {
+ "alias": "/.*sdd2.*/",
+ "color": "#FCEACA"
+ },
+ {
+ "alias": "/.*sde3.*/",
+ "color": "#F9E2D2"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_disk_read_bytes_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "intervalFactor": 4,
+ "legendFormat": "{{device}} - Read bytes",
+ "refId": "A",
+ "step": 8
+ },
+ {
+ "expr": "irate(node_disk_written_bytes_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{device}} - Written bytes",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Disk R/W Data",
+ "tooltip": {
+ "shared": false,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": "bytes read (-) / write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "",
+ "fill": 3,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 39
+ },
+ "hiddenSeries": false,
+ "id": 37,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideZero": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*Read.*/",
+ "transform": "negative-Y"
+ },
+ {
+ "alias": "/.*sda_.*/",
+ "color": "#7EB26D"
+ },
+ {
+ "alias": "/.*sdb_.*/",
+ "color": "#EAB839"
+ },
+ {
+ "alias": "/.*sdc_.*/",
+ "color": "#6ED0E0"
+ },
+ {
+ "alias": "/.*sdd_.*/",
+ "color": "#EF843C"
+ },
+ {
+ "alias": "/.*sde_.*/",
+ "color": "#E24D42"
+ },
+ {
+ "alias": "/.*sda1.*/",
+ "color": "#584477"
+ },
+ {
+ "alias": "/.*sda2_.*/",
+ "color": "#BA43A9"
+ },
+ {
+ "alias": "/.*sda3_.*/",
+ "color": "#F4D598"
+ },
+ {
+ "alias": "/.*sdb1.*/",
+ "color": "#0A50A1"
+ },
+ {
+ "alias": "/.*sdb2.*/",
+ "color": "#BF1B00"
+ },
+ {
+ "alias": "/.*sdb3.*/",
+ "color": "#E0752D"
+ },
+ {
+ "alias": "/.*sdc1.*/",
+ "color": "#962D82"
+ },
+ {
+ "alias": "/.*sdc2.*/",
+ "color": "#614D93"
+ },
+ {
+ "alias": "/.*sdc3.*/",
+ "color": "#9AC48A"
+ },
+ {
+ "alias": "/.*sdd1.*/",
+ "color": "#65C5DB"
+ },
+ {
+ "alias": "/.*sdd2.*/",
+ "color": "#F9934E"
+ },
+ {
+ "alias": "/.*sdd3.*/",
+ "color": "#EA6460"
+ },
+ {
+ "alias": "/.*sde1.*/",
+ "color": "#E0F9D7"
+ },
+ {
+ "alias": "/.*sdd2.*/",
+ "color": "#FCEACA"
+ },
+ {
+ "alias": "/.*sde3.*/",
+ "color": "#F9E2D2"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_disk_read_time_seconds_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "hide": false,
+ "intervalFactor": 4,
+ "legendFormat": "{{device}} - Read time",
+ "refId": "A",
+ "step": 8
+ },
+ {
+ "expr": "irate(node_disk_write_time_seconds_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "{{device}} - Write time",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Disk R/W Time",
+ "tooltip": {
+ "shared": false,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": "time. read (-) / write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 39
+ },
+ "hiddenSeries": false,
+ "id": 35,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideZero": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*sda_.*/",
+ "color": "#7EB26D"
+ },
+ {
+ "alias": "/.*sdb_.*/",
+ "color": "#EAB839"
+ },
+ {
+ "alias": "/.*sdc_.*/",
+ "color": "#6ED0E0"
+ },
+ {
+ "alias": "/.*sdd_.*/",
+ "color": "#EF843C"
+ },
+ {
+ "alias": "/.*sde_.*/",
+ "color": "#E24D42"
+ },
+ {
+ "alias": "/.*sda1.*/",
+ "color": "#584477"
+ },
+ {
+ "alias": "/.*sda2_.*/",
+ "color": "#BA43A9"
+ },
+ {
+ "alias": "/.*sda3_.*/",
+ "color": "#F4D598"
+ },
+ {
+ "alias": "/.*sdb1.*/",
+ "color": "#0A50A1"
+ },
+ {
+ "alias": "/.*sdb2.*/",
+ "color": "#BF1B00"
+ },
+ {
+ "alias": "/.*sdb3.*/",
+ "color": "#E0752D"
+ },
+ {
+ "alias": "/.*sdc1.*/",
+ "color": "#962D82"
+ },
+ {
+ "alias": "/.*sdc2.*/",
+ "color": "#614D93"
+ },
+ {
+ "alias": "/.*sdc3.*/",
+ "color": "#9AC48A"
+ },
+ {
+ "alias": "/.*sdd1.*/",
+ "color": "#65C5DB"
+ },
+ {
+ "alias": "/.*sdd2.*/",
+ "color": "#F9934E"
+ },
+ {
+ "alias": "/.*sdd3.*/",
+ "color": "#EA6460"
+ },
+ {
+ "alias": "/.*sde1.*/",
+ "color": "#E0F9D7"
+ },
+ {
+ "alias": "/.*sdd2.*/",
+ "color": "#FCEACA"
+ },
+ {
+ "alias": "/.*sde3.*/",
+ "color": "#F9E2D2"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_disk_io_time_weighted_seconds_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "intervalFactor": 4,
+ "legendFormat": "{{device}} - IO time weighted",
+ "refId": "A",
+ "step": 8
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Disk IOs Weighted",
+ "tooltip": {
+ "shared": false,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": "time",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 49
+ },
+ "hiddenSeries": false,
+ "id": 133,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideZero": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*Read.*/",
+ "transform": "negative-Y"
+ },
+ {
+ "alias": "/.*sda_.*/",
+ "color": "#7EB26D"
+ },
+ {
+ "alias": "/.*sdb_.*/",
+ "color": "#EAB839"
+ },
+ {
+ "alias": "/.*sdc_.*/",
+ "color": "#6ED0E0"
+ },
+ {
+ "alias": "/.*sdd_.*/",
+ "color": "#EF843C"
+ },
+ {
+ "alias": "/.*sde_.*/",
+ "color": "#E24D42"
+ },
+ {
+ "alias": "/.*sda1.*/",
+ "color": "#584477"
+ },
+ {
+ "alias": "/.*sda2_.*/",
+ "color": "#BA43A9"
+ },
+ {
+ "alias": "/.*sda3_.*/",
+ "color": "#F4D598"
+ },
+ {
+ "alias": "/.*sdb1.*/",
+ "color": "#0A50A1"
+ },
+ {
+ "alias": "/.*sdb2.*/",
+ "color": "#BF1B00"
+ },
+ {
+ "alias": "/.*sdb3.*/",
+ "color": "#E0752D"
+ },
+ {
+ "alias": "/.*sdc1.*/",
+ "color": "#962D82"
+ },
+ {
+ "alias": "/.*sdc2.*/",
+ "color": "#614D93"
+ },
+ {
+ "alias": "/.*sdc3.*/",
+ "color": "#9AC48A"
+ },
+ {
+ "alias": "/.*sdd1.*/",
+ "color": "#65C5DB"
+ },
+ {
+ "alias": "/.*sdd2.*/",
+ "color": "#F9934E"
+ },
+ {
+ "alias": "/.*sdd3.*/",
+ "color": "#EA6460"
+ },
+ {
+ "alias": "/.*sde1.*/",
+ "color": "#E0F9D7"
+ },
+ {
+ "alias": "/.*sdd2.*/",
+ "color": "#FCEACA"
+ },
+ {
+ "alias": "/.*sde3.*/",
+ "color": "#F9E2D2"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_disk_reads_merged_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "intervalFactor": 2,
+ "legendFormat": "{{device}} - Read merged",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "irate(node_disk_writes_merged_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "intervalFactor": 2,
+ "legendFormat": "{{device}} - Write merged",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Disk R/W Merged",
+ "tooltip": {
+ "shared": false,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "iops",
+ "label": "I/Os",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "",
+ "fill": 3,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 49
+ },
+ "hiddenSeries": false,
+ "id": 36,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideZero": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*sda_.*/",
+ "color": "#7EB26D"
+ },
+ {
+ "alias": "/.*sdb_.*/",
+ "color": "#EAB839"
+ },
+ {
+ "alias": "/.*sdc_.*/",
+ "color": "#6ED0E0"
+ },
+ {
+ "alias": "/.*sdd_.*/",
+ "color": "#EF843C"
+ },
+ {
+ "alias": "/.*sde_.*/",
+ "color": "#E24D42"
+ },
+ {
+ "alias": "/.*sda1.*/",
+ "color": "#584477"
+ },
+ {
+ "alias": "/.*sda2_.*/",
+ "color": "#BA43A9"
+ },
+ {
+ "alias": "/.*sda3_.*/",
+ "color": "#F4D598"
+ },
+ {
+ "alias": "/.*sdb1.*/",
+ "color": "#0A50A1"
+ },
+ {
+ "alias": "/.*sdb2.*/",
+ "color": "#BF1B00"
+ },
+ {
+ "alias": "/.*sdb3.*/",
+ "color": "#E0752D"
+ },
+ {
+ "alias": "/.*sdc1.*/",
+ "color": "#962D82"
+ },
+ {
+ "alias": "/.*sdc2.*/",
+ "color": "#614D93"
+ },
+ {
+ "alias": "/.*sdc3.*/",
+ "color": "#9AC48A"
+ },
+ {
+ "alias": "/.*sdd1.*/",
+ "color": "#65C5DB"
+ },
+ {
+ "alias": "/.*sdd2.*/",
+ "color": "#F9934E"
+ },
+ {
+ "alias": "/.*sdd3.*/",
+ "color": "#EA6460"
+ },
+ {
+ "alias": "/.*sde1.*/",
+ "color": "#E0F9D7"
+ },
+ {
+ "alias": "/.*sdd2.*/",
+ "color": "#FCEACA"
+ },
+ {
+ "alias": "/.*sde3.*/",
+ "color": "#F9E2D2"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_disk_io_time_seconds_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "intervalFactor": 4,
+ "legendFormat": "{{device}} - IO time",
+ "refId": "A",
+ "step": 8
+ },
+ {
+ "expr": "irate(node_disk_discard_time_seconds_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "interval": "",
+ "intervalFactor": 4,
+ "legendFormat": "{{device}} - discard time",
+ "refId": "B",
+ "step": 8
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Time Spent Doing I/Os",
+ "tooltip": {
+ "shared": false,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": "time",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 59
+ },
+ "hiddenSeries": false,
+ "id": 34,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideZero": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*sda_.*/",
+ "color": "#7EB26D"
+ },
+ {
+ "alias": "/.*sdb_.*/",
+ "color": "#EAB839"
+ },
+ {
+ "alias": "/.*sdc_.*/",
+ "color": "#6ED0E0"
+ },
+ {
+ "alias": "/.*sdd_.*/",
+ "color": "#EF843C"
+ },
+ {
+ "alias": "/.*sde_.*/",
+ "color": "#E24D42"
+ },
+ {
+ "alias": "/.*sda1.*/",
+ "color": "#584477"
+ },
+ {
+ "alias": "/.*sda2_.*/",
+ "color": "#BA43A9"
+ },
+ {
+ "alias": "/.*sda3_.*/",
+ "color": "#F4D598"
+ },
+ {
+ "alias": "/.*sdb1.*/",
+ "color": "#0A50A1"
+ },
+ {
+ "alias": "/.*sdb2.*/",
+ "color": "#BF1B00"
+ },
+ {
+ "alias": "/.*sdb3.*/",
+ "color": "#E0752D"
+ },
+ {
+ "alias": "/.*sdc1.*/",
+ "color": "#962D82"
+ },
+ {
+ "alias": "/.*sdc2.*/",
+ "color": "#614D93"
+ },
+ {
+ "alias": "/.*sdc3.*/",
+ "color": "#9AC48A"
+ },
+ {
+ "alias": "/.*sdd1.*/",
+ "color": "#65C5DB"
+ },
+ {
+ "alias": "/.*sdd2.*/",
+ "color": "#F9934E"
+ },
+ {
+ "alias": "/.*sdd3.*/",
+ "color": "#EA6460"
+ },
+ {
+ "alias": "/.*sde1.*/",
+ "color": "#E0F9D7"
+ },
+ {
+ "alias": "/.*sdd2.*/",
+ "color": "#FCEACA"
+ },
+ {
+ "alias": "/.*sde3.*/",
+ "color": "#F9E2D2"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_disk_io_now{instance=\"$node\",job=\"$job\"}[5m])",
+ "intervalFactor": 4,
+ "legendFormat": "{{device}} - IO now",
+ "refId": "A",
+ "step": 8
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Disk IOs Current in Progress",
+ "tooltip": {
+ "shared": false,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "iops",
+ "label": "I/Os",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 59
+ },
+ "hiddenSeries": false,
+ "id": 301,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideZero": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null as zero",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "$$hashKey": "object:2034",
+ "alias": "/.*sda_.*/",
+ "color": "#7EB26D"
+ },
+ {
+ "$$hashKey": "object:2035",
+ "alias": "/.*sdb_.*/",
+ "color": "#EAB839"
+ },
+ {
+ "$$hashKey": "object:2036",
+ "alias": "/.*sdc_.*/",
+ "color": "#6ED0E0"
+ },
+ {
+ "$$hashKey": "object:2037",
+ "alias": "/.*sdd_.*/",
+ "color": "#EF843C"
+ },
+ {
+ "$$hashKey": "object:2038",
+ "alias": "/.*sde_.*/",
+ "color": "#E24D42"
+ },
+ {
+ "$$hashKey": "object:2039",
+ "alias": "/.*sda1.*/",
+ "color": "#584477"
+ },
+ {
+ "$$hashKey": "object:2040",
+ "alias": "/.*sda2_.*/",
+ "color": "#BA43A9"
+ },
+ {
+ "$$hashKey": "object:2041",
+ "alias": "/.*sda3_.*/",
+ "color": "#F4D598"
+ },
+ {
+ "$$hashKey": "object:2042",
+ "alias": "/.*sdb1.*/",
+ "color": "#0A50A1"
+ },
+ {
+ "$$hashKey": "object:2043",
+ "alias": "/.*sdb2.*/",
+ "color": "#BF1B00"
+ },
+ {
+ "$$hashKey": "object:2044",
+ "alias": "/.*sdb3.*/",
+ "color": "#E0752D"
+ },
+ {
+ "$$hashKey": "object:2045",
+ "alias": "/.*sdc1.*/",
+ "color": "#962D82"
+ },
+ {
+ "$$hashKey": "object:2046",
+ "alias": "/.*sdc2.*/",
+ "color": "#614D93"
+ },
+ {
+ "$$hashKey": "object:2047",
+ "alias": "/.*sdc3.*/",
+ "color": "#9AC48A"
+ },
+ {
+ "$$hashKey": "object:2048",
+ "alias": "/.*sdd1.*/",
+ "color": "#65C5DB"
+ },
+ {
+ "$$hashKey": "object:2049",
+ "alias": "/.*sdd2.*/",
+ "color": "#F9934E"
+ },
+ {
+ "$$hashKey": "object:2050",
+ "alias": "/.*sdd3.*/",
+ "color": "#EA6460"
+ },
+ {
+ "$$hashKey": "object:2051",
+ "alias": "/.*sde1.*/",
+ "color": "#E0F9D7"
+ },
+ {
+ "$$hashKey": "object:2052",
+ "alias": "/.*sdd2.*/",
+ "color": "#FCEACA"
+ },
+ {
+ "$$hashKey": "object:2053",
+ "alias": "/.*sde3.*/",
+ "color": "#F9E2D2"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_disk_discards_completed_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "interval": "",
+ "intervalFactor": 4,
+ "legendFormat": "{{device}} - Discards completed",
+ "refId": "A",
+ "step": 8
+ },
+ {
+ "expr": "irate(node_disk_discards_merged_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{device}} - Discards merged",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Disk IOps Discards completed / merged",
+ "tooltip": {
+ "shared": false,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:2186",
+ "format": "iops",
+ "label": "IOs",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:2187",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "repeat": null,
+ "title": "Storage Disk",
+ "type": "row"
+ },
+ {
+ "collapsed": true,
+ "datasource": "${DS_PROMETHEUS}",
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 29
+ },
+ "id": 271,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 3,
+ "description": "",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 78
+ },
+ "hiddenSeries": false,
+ "id": 43,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "{{mountpoint}} - Available",
+ "metric": "",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "node_filesystem_free_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}",
+ "format": "time_series",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "{{mountpoint}} - Free",
+ "refId": "B",
+ "step": 2
+ },
+ {
+ "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}",
+ "format": "time_series",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "{{mountpoint}} - Size",
+ "refId": "C",
+ "step": 2
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Filesystem space available",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:3826",
+ "format": "bytes",
+ "label": "bytes",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "$$hashKey": "object:3827",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 78
+ },
+ "hiddenSeries": false,
+ "id": 41,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideZero": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_filesystem_files_free{instance=\"$node\",job=\"$job\",device!~'rootfs'}",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "{{mountpoint}} - Free file nodes",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "File Nodes Free",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:3894",
+ "format": "short",
+ "label": "file nodes",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "$$hashKey": "object:3895",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 88
+ },
+ "hiddenSeries": false,
+ "id": 28,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_filefd_maximum{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 4,
+ "legendFormat": "Max open files",
+ "refId": "A",
+ "step": 8
+ },
+ {
+ "expr": "node_filefd_allocated{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Open files",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "File Descriptor",
+ "tooltip": {
+ "shared": false,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "files",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 88
+ },
+ "hiddenSeries": false,
+ "id": 219,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideZero": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_filesystem_files{instance=\"$node\",job=\"$job\",device!~'rootfs'}",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "{{mountpoint}} - File nodes total",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "File Nodes Size",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "file Nodes",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "/ ReadOnly": "#890F02"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": null,
+ "description": "",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 98
+ },
+ "hiddenSeries": false,
+ "id": 44,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": true,
+ "hideZero": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_filesystem_readonly{instance=\"$node\",job=\"$job\",device!~'rootfs'}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{mountpoint}} - ReadOnly",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "node_filesystem_device_error{instance=\"$node\",job=\"$job\",device!~'rootfs',fstype!~'tmpfs'}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{mountpoint}} - Device error",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Filesystem in ReadOnly / Error",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:3670",
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": "1",
+ "min": "0",
+ "show": true
+ },
+ {
+ "$$hashKey": "object:3671",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "repeat": null,
+ "title": "Storage Filesystem",
+ "type": "row"
+ },
+ {
+ "collapsed": true,
+ "datasource": "${DS_PROMETHEUS}",
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 30
+ },
+ "id": 272,
+ "panels": [
+ {
+ "aliasColors": {
+ "receive_packets_eth0": "#7EB26D",
+ "receive_packets_lo": "#E24D42",
+ "transmit_packets_eth0": "#7EB26D",
+ "transmit_packets_lo": "#E24D42"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 30
+ },
+ "hiddenSeries": false,
+ "id": 60,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 300,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*Trans.*/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_network_receive_packets_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{device}} - Receive",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "irate(node_network_transmit_packets_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{device}} - Transmit",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Network Traffic by Packets",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "pps",
+ "label": "packets out (-) / in (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 30
+ },
+ "hiddenSeries": false,
+ "id": 142,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 300,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*Trans.*/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_network_receive_errs_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{device}} - Receive errors",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "irate(node_network_transmit_errs_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{device}} - Rransmit errors",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Network Traffic Errors",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "pps",
+ "label": "packets out (-) / in (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 40
+ },
+ "hiddenSeries": false,
+ "id": 143,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 300,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*Trans.*/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_network_receive_drop_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{device}} - Receive drop",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "irate(node_network_transmit_drop_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{device}} - Transmit drop",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Network Traffic Drop",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "pps",
+ "label": "packets out (-) / in (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 40
+ },
+ "hiddenSeries": false,
+ "id": 141,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 300,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*Trans.*/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_network_receive_compressed_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{device}} - Receive compressed",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "irate(node_network_transmit_compressed_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{device}} - Transmit compressed",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Network Traffic Compressed",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "pps",
+ "label": "packets out (-) / in (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 50
+ },
+ "hiddenSeries": false,
+ "id": 146,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 300,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*Trans.*/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_network_receive_multicast_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{device}} - Receive multicast",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Network Traffic Multicast",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "pps",
+ "label": "packets out (-) / in (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 50
+ },
+ "hiddenSeries": false,
+ "id": 144,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 300,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*Trans.*/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_network_receive_fifo_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{device}} - Receive fifo",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "irate(node_network_transmit_fifo_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{device}} - Transmit fifo",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Network Traffic Fifo",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "pps",
+ "label": "packets out (-) / in (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 60
+ },
+ "hiddenSeries": false,
+ "id": 145,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 300,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "$$hashKey": "object:576",
+ "alias": "/.*Trans.*/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_network_receive_frame_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "{{device}} - Receive frame",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Network Traffic Frame",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:589",
+ "format": "pps",
+ "label": "packets out (-) / in (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:590",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 60
+ },
+ "hiddenSeries": false,
+ "id": 231,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 300,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_network_transmit_carrier_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{device}} - Statistic transmit_carrier",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Network Traffic Carrier",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 70
+ },
+ "hiddenSeries": false,
+ "id": 232,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 300,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*Trans.*/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_network_transmit_colls_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{device}} - Transmit colls",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Network Traffic Colls",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 70
+ },
+ "hiddenSeries": false,
+ "id": 61,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "$$hashKey": "object:663",
+ "alias": "NF conntrack limit",
+ "color": "#890F02",
+ "fill": 0
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_nf_conntrack_entries{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "NF conntrack entries",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "node_nf_conntrack_entries_limit{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "NF conntrack limit",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "NF Contrack",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:678",
+ "format": "short",
+ "label": "entries",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "$$hashKey": "object:679",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 80
+ },
+ "hiddenSeries": false,
+ "id": 230,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_arp_entries{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{ device }} - ARP entries",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "ARP Entries",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Entries",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 80
+ },
+ "hiddenSeries": false,
+ "id": 288,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 1,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_network_mtu_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{ device }} - Bytes",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "MTU",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 0,
+ "format": "bytes",
+ "label": "bytes",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 90
+ },
+ "hiddenSeries": false,
+ "id": 280,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 1,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_network_speed_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{ device }} - Speed",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Speed",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 0,
+ "format": "bytes",
+ "label": "bytes",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 90
+ },
+ "hiddenSeries": false,
+ "id": 289,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 1,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_network_transmit_queue_length{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{ device }} - Interface transmit queue length",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Queue Length",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 0,
+ "format": "none",
+ "label": "packets",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 100
+ },
+ "hiddenSeries": false,
+ "id": 290,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 300,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "$$hashKey": "object:232",
+ "alias": "/.*Dropped.*/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_softnet_processed_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "CPU {{cpu}} - Processed",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "irate(node_softnet_dropped_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "CPU {{cpu}} - Dropped",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Softnet Packets",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:207",
+ "format": "short",
+ "label": "packetes drop (-) / process (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:208",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 100
+ },
+ "hiddenSeries": false,
+ "id": 310,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 300,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_softnet_times_squeezed_total{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "CPU {{cpu}} - Squeezed",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Softnet Out of Quota",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:207",
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:208",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 110
+ },
+ "hiddenSeries": false,
+ "id": 309,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 300,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_network_up{operstate=\"up\",instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{interface}} - Operational state UP",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "node_network_carrier{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "instant": false,
+ "legendFormat": "{{device}} - Physical link state",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Network Operational Status",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "repeat": null,
+ "title": "Network Traffic",
+ "type": "row"
+ },
+ {
+ "collapsed": true,
+ "datasource": "${DS_PROMETHEUS}",
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 31
+ },
+ "id": 273,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 13
+ },
+ "hiddenSeries": false,
+ "id": 63,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 300,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_sockstat_TCP_alloc{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "TCP_alloc - Allocated sockets",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "node_sockstat_TCP_inuse{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "TCP_inuse - Tcp sockets currently in use",
+ "refId": "B",
+ "step": 240
+ },
+ {
+ "expr": "node_sockstat_TCP_mem{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": true,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "TCP_mem - Used memory for tcp",
+ "refId": "C",
+ "step": 240
+ },
+ {
+ "expr": "node_sockstat_TCP_orphan{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "TCP_orphan - Orphan sockets",
+ "refId": "D",
+ "step": 240
+ },
+ {
+ "expr": "node_sockstat_TCP_tw{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "TCP_tw - Sockets wating close",
+ "refId": "E",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Sockstat TCP",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 13
+ },
+ "hiddenSeries": false,
+ "id": 124,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 300,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_sockstat_UDPLITE_inuse{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "UDPLITE_inuse - Udplite sockets currently in use",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "node_sockstat_UDP_inuse{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "UDP_inuse - Udp sockets currently in use",
+ "refId": "B",
+ "step": 240
+ },
+ {
+ "expr": "node_sockstat_UDP_mem{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "UDP_mem - Used memory for udp",
+ "refId": "C",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Sockstat UDP",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 23
+ },
+ "hiddenSeries": false,
+ "id": 126,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 300,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_sockstat_sockets_used{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Sockets_used - Sockets currently in use",
+ "refId": "A",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Sockstat Used",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "sockets",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 23
+ },
+ "hiddenSeries": false,
+ "id": 220,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 300,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_sockstat_TCP_mem_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "mem_bytes - TCP sockets in that state",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "node_sockstat_UDP_mem_bytes{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "mem_bytes - UDP sockets in that state",
+ "refId": "B",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Sockstat Memory Size",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "bytes",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 33
+ },
+ "hiddenSeries": false,
+ "id": 125,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 300,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_sockstat_FRAG_inuse{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "FRAG_inuse - Frag sockets currently in use",
+ "refId": "A",
+ "step": 240
+ },
+ {
+ "expr": "node_sockstat_FRAG_memory{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "FRAG_memory - Used memory for frag",
+ "refId": "B",
+ "step": 240
+ },
+ {
+ "expr": "node_sockstat_RAW_inuse{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "RAW_inuse - Raw sockets currently in use",
+ "refId": "C",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Sockstat FRAG / RAW",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:1572",
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "$$hashKey": "object:1573",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "repeat": null,
+ "title": "Network Sockstat",
+ "type": "row"
+ },
+ {
+ "collapsed": true,
+ "datasource": "${DS_PROMETHEUS}",
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 32
+ },
+ "id": 274,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 32
+ },
+ "height": "",
+ "hiddenSeries": false,
+ "id": 221,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 300,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 12,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "$$hashKey": "object:1876",
+ "alias": "/.*Out.*/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_netstat_IpExt_InOctets{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "InOctets - Received octets",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "irate(node_netstat_IpExt_OutOctets{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "OutOctets - Sent octets",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Netstat IP In / Out Octets",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:1889",
+ "format": "short",
+ "label": "octects out (-) / in (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:1890",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 32
+ },
+ "height": "",
+ "hiddenSeries": false,
+ "id": 81,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": 300,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_netstat_Ip_Forwarding{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Forwarding - IP forwarding",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Netstat IP Forwarding",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:1957",
+ "format": "short",
+ "label": "datagrams",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "$$hashKey": "object:1958",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": null,
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 42
+ },
+ "height": "",
+ "hiddenSeries": false,
+ "id": 115,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 12,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*Out.*/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_netstat_Icmp_InMsgs{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "InMsgs - Messages which the entity received. Note that this counter includes all those counted by icmpInErrors",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "irate(node_netstat_Icmp_OutMsgs{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "OutMsgs - Messages which this entity attempted to send. Note that this counter includes all those counted by icmpOutErrors",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "ICMP In / Out",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "messages out (-) / in (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": null,
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 42
+ },
+ "height": "",
+ "hiddenSeries": false,
+ "id": 50,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 12,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*Out.*/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_netstat_Icmp_InErrors{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "InErrors - Messages which the entity received but determined as having ICMP-specific errors (bad ICMP checksums, bad length, etc.)",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "ICMP Errors",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "messages out (-) / in (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": null,
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 52
+ },
+ "height": "",
+ "hiddenSeries": false,
+ "id": 55,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 12,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*Out.*/",
+ "transform": "negative-Y"
+ },
+ {
+ "alias": "/.*Snd.*/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_netstat_Udp_InDatagrams{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "InDatagrams - Datagrams received",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "irate(node_netstat_Udp_OutDatagrams{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "OutDatagrams - Datagrams sent",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "UDP In / Out",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "datagrams out (-) / in (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 52
+ },
+ "height": "",
+ "hiddenSeries": false,
+ "id": 109,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 12,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_netstat_Udp_InErrors{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "InErrors - UDP Datagrams that could not be delivered to an application",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "irate(node_netstat_Udp_NoPorts{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "NoPorts - UDP Datagrams received on a port with no listener",
+ "refId": "B",
+ "step": 4
+ },
+ {
+ "expr": "irate(node_netstat_UdpLite_InErrors{instance=\"$node\",job=\"$job\"}[5m])",
+ "interval": "",
+ "legendFormat": "InErrors Lite - UDPLite Datagrams that could not be delivered to an application",
+ "refId": "C"
+ },
+ {
+ "expr": "irate(node_netstat_Udp_RcvbufErrors{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "RcvbufErrors - UDP buffer errors received",
+ "refId": "D",
+ "step": 4
+ },
+ {
+ "expr": "irate(node_netstat_Udp_SndbufErrors{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "SndbufErrors - UDP buffer errors send",
+ "refId": "E",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "UDP Errors",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:4232",
+ "format": "short",
+ "label": "datagrams",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:4233",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": null,
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 62
+ },
+ "height": "",
+ "hiddenSeries": false,
+ "id": 299,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 12,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*Out.*/",
+ "transform": "negative-Y"
+ },
+ {
+ "alias": "/.*Snd.*/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_netstat_Tcp_InSegs{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "instant": false,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "InSegs - Segments received, including those received in error. This count includes segments received on currently established connections",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "irate(node_netstat_Tcp_OutSegs{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "OutSegs - Segments sent, including those on current connections but excluding those containing only retransmitted octets",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "TCP In / Out",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "datagrams out (-) / in (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 62
+ },
+ "height": "",
+ "hiddenSeries": false,
+ "id": 104,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 12,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_netstat_TcpExt_ListenOverflows{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "ListenOverflows - Times the listen queue of a socket overflowed",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "irate(node_netstat_TcpExt_ListenDrops{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "ListenDrops - SYNs to LISTEN sockets ignored",
+ "refId": "B",
+ "step": 4
+ },
+ {
+ "expr": "irate(node_netstat_TcpExt_TCPSynRetrans{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "TCPSynRetrans - SYN-SYN/ACK retransmits to break down retransmissions in SYN, fast/timeout retransmits",
+ "refId": "C",
+ "step": 4
+ },
+ {
+ "expr": "irate(node_netstat_Tcp_RetransSegs{instance=\"$node\",job=\"$job\"}[5m])",
+ "interval": "",
+ "legendFormat": "RetransSegs - Segments retransmitted - that is, the number of TCP segments transmitted containing one or more previously transmitted octets",
+ "refId": "D"
+ },
+ {
+ "expr": "irate(node_netstat_Tcp_InErrs{instance=\"$node\",job=\"$job\"}[5m])",
+ "interval": "",
+ "legendFormat": "InErrs - Segments received in error (e.g., bad TCP checksums)",
+ "refId": "E"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "TCP Errors",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 72
+ },
+ "height": "",
+ "hiddenSeries": false,
+ "id": 85,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 12,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "$$hashKey": "object:454",
+ "alias": "/.*MaxConn *./",
+ "color": "#890F02",
+ "fill": 0
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_netstat_Tcp_CurrEstab{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "CurrEstab - TCP connections for which the current state is either ESTABLISHED or CLOSE- WAIT",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "node_netstat_Tcp_MaxConn{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "MaxConn - Limit on the total number of TCP connections the entity can support (Dinamic is \"-1\")",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "TCP Connections",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:469",
+ "format": "short",
+ "label": "connections",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "$$hashKey": "object:470",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 72
+ },
+ "height": "",
+ "hiddenSeries": false,
+ "id": 91,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 12,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*Sent.*/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_netstat_TcpExt_SyncookiesFailed{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "SyncookiesFailed - Invalid SYN cookies received",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "irate(node_netstat_TcpExt_SyncookiesRecv{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "SyncookiesRecv - SYN cookies received",
+ "refId": "B",
+ "step": 4
+ },
+ {
+ "expr": "irate(node_netstat_TcpExt_SyncookiesSent{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "SyncookiesSent - SYN cookies sent",
+ "refId": "C",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "TCP SynCookie",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "counter out (-) / in (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 82
+ },
+ "height": "",
+ "hiddenSeries": false,
+ "id": 82,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxPerRow": 12,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_netstat_Tcp_ActiveOpens{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "ActiveOpens - TCP connections that have made a direct transition to the SYN-SENT state from the CLOSED state",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "irate(node_netstat_Tcp_PassiveOpens{instance=\"$node\",job=\"$job\"}[5m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "PassiveOpens - TCP connections that have made a direct transition to the SYN-RCVD state from the LISTEN state",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "TCP Direct Transition",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "connections",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "repeat": null,
+ "title": "Network Netstat",
+ "type": "row"
+ },
+ {
+ "collapsed": true,
+ "datasource": "${DS_PROMETHEUS}",
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 33
+ },
+ "id": 279,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 0,
+ "y": 54
+ },
+ "hiddenSeries": false,
+ "id": 40,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_scrape_collector_duration_seconds{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{collector}} - Scrape duration",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Node Exporter Scrape Time",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": "seconds",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "",
+ "fill": 2,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 10,
+ "w": 12,
+ "x": 12,
+ "y": 54
+ },
+ "hiddenSeries": false,
+ "id": 157,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "$$hashKey": "object:1969",
+ "alias": "/.*error.*/",
+ "color": "#F2495C",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_scrape_collector_success{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{collector}} - Scrape success",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "node_textfile_scrape_error{instance=\"$node\",job=\"$job\"}",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{collector}} - Scrape textfile error (1 = true)",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Node Exporter Scrape",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:1484",
+ "format": "short",
+ "label": "counter",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:1485",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "repeat": null,
+ "title": "Node Exporter",
+ "type": "row"
+ }
+ ],
+ "refresh": "1m",
+ "schemaVersion": 22,
+ "style": "dark",
+ "tags": [
+ "linux"
+ ],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "selected": false,
+ "text": "default",
+ "value": "default"
+ },
+ "hide": 0,
+ "includeAll": false,
+ "label": "datasource",
+ "multi": false,
+ "name": "DS_PROMETHEUS",
+ "options": [],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "type": "datasource"
+ },
+ {
+ "allValue": null,
+ "current": {},
+ "datasource": "${DS_PROMETHEUS}",
+ "definition": "",
+ "hide": 0,
+ "includeAll": false,
+ "index": -1,
+ "label": "Job",
+ "multi": false,
+ "name": "job",
+ "options": [],
+ "query": "label_values(node_uname_info, job)",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": {},
+ "datasource": "${DS_PROMETHEUS}",
+ "definition": "label_values(node_uname_info{job=\"$job\"}, instance)",
+ "hide": 0,
+ "includeAll": false,
+ "index": -1,
+ "label": "Host:",
+ "multi": false,
+ "name": "node",
+ "options": [],
+ "query": "label_values(node_uname_info{job=\"$job\"}, instance)",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": {
+ "selected": false,
+ "text": "[a-z]+|nvme[0-9]+n[0-9]+",
+ "value": "[a-z]+|nvme[0-9]+n[0-9]+"
+ },
+ "hide": 2,
+ "includeAll": false,
+ "label": null,
+ "multi": false,
+ "name": "diskdevices",
+ "options": [
+ {
+ "selected": true,
+ "text": "[a-z]+|nvme[0-9]+n[0-9]+",
+ "value": "[a-z]+|nvme[0-9]+n[0-9]+"
+ }
+ ],
+ "query": "[a-z]+|nvme[0-9]+n[0-9]+",
+ "skipUrlSync": false,
+ "type": "custom"
+ }
+ ]
+ },
+ "time": {
+ "from": "now-4h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "browser",
+ "title": "Node Exporter",
+ "version": 1
+} \ No newline at end of file
diff --git a/terraform-ci-infra/1n_nmd/grafana/conf/nomad.json b/terraform-ci-infra/1n_nmd/grafana/conf/nomad.json
new file mode 100644
index 0000000000..40ffeddf7b
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/grafana/conf/nomad.json
@@ -0,0 +1,869 @@
+{
+ "__inputs": [
+ {
+ "name": "DS_PROMETHEUS",
+ "label": "Prometheus",
+ "description": "",
+ "type": "datasource",
+ "pluginId": "prometheus",
+ "pluginName": "Prometheus"
+ }
+ ],
+ "__requires": [
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "5.3.2"
+ },
+ {
+ "type": "panel",
+ "id": "graph",
+ "name": "Graph",
+ "version": "5.0.0"
+ },
+ {
+ "type": "datasource",
+ "id": "prometheus",
+ "name": "Prometheus",
+ "version": "5.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "singlestat",
+ "name": "Singlestat",
+ "version": "5.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "description": "Nomad Jobs metrics",
+ "editable": true,
+ "gnetId": 12787,
+ "graphTooltip": 0,
+ "id": null,
+ "iteration": 1596708119930,
+ "links": [],
+ "panels": [
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": null,
+ "format": "dtdurations",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 4,
+ "x": 0,
+ "y": 0
+ },
+ "id": 16,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "alias": "",
+ "expr": "max(nomad_client_uptime{instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "rawSql": "SELECT\n UNIX_TIMESTAMP(<time_column>) as time_sec,\n <value column> as value,\n <series name column> as metric\nFROM <table name>\nWHERE $__timeFilter(time_column)\nORDER BY <time_column> ASC\n",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "Uptime",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorPrefix": false,
+ "colorValue": false,
+ "colors": [
+ "#7eb26d",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 4,
+ "x": 4,
+ "y": 0
+ },
+ "id": 17,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": true
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "alias": "",
+ "expr": "count(sum(nomad_client_allocs_memory_cache) by (exported_job))",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "rawSql": "SELECT\n UNIX_TIMESTAMP(<time_column>) as time_sec,\n <value column> as value,\n <series name column> as metric\nFROM <table name>\nWHERE $__timeFilter(time_column)\nORDER BY <time_column> ASC\n",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "Jobs",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorPrefix": false,
+ "colorValue": false,
+ "colors": [
+ "#7eb26d",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 4,
+ "x": 8,
+ "y": 0
+ },
+ "id": 12,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": true
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "alias": "",
+ "expr": "sum(nomad_client_allocations_running{datacenter=\"$datacenter\",instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "rawSql": "SELECT\n UNIX_TIMESTAMP(<time_column>) as time_sec,\n <value column> as value,\n <series name column> as metric\nFROM <table name>\nWHERE $__timeFilter(time_column)\nORDER BY <time_column> ASC\n",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "Allocs",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "gridPos": {
+ "h": 4,
+ "w": 12,
+ "x": 12,
+ "y": 0
+ },
+ "id": 14,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "alias": "",
+ "expr": "sum(nomad_client_allocations_blocked{datacenter=\"$datacenter\",instance=~\"$instance\"})",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "legendFormat": "Blocked",
+ "rawSql": "SELECT\n UNIX_TIMESTAMP(<time_column>) as time_sec,\n <value column> as value,\n <series name column> as metric\nFROM <table name>\nWHERE $__timeFilter(time_column)\nORDER BY <time_column> ASC\n",
+ "refId": "A"
+ },
+ {
+ "expr": "sum(nomad_client_allocations_pending{datacenter=\"$datacenter\",instance=~\"$instance\"})",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "legendFormat": "Pending",
+ "refId": "B"
+ },
+ {
+ "expr": "sum(nomad_client_allocations_restart{datacenter=\"$datacenter\",instance=~\"$instance\"})",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "legendFormat": "Restart ",
+ "refId": "C"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Block/Peding/Restart",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "gridPos": {
+ "h": 6,
+ "w": 12,
+ "x": 0,
+ "y": 13
+ },
+ "id": 2,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": "host",
+ "repeatDirection": "v",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(nomad_client_allocs_cpu_total_percent[5m:10s]) > 1",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "{{task}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "CPU Usage Percent",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 3,
+ "format": "percentunit",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "gridPos": {
+ "h": 6,
+ "w": 12,
+ "x": 12,
+ "y": 13
+ },
+ "id": 3,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": "host",
+ "repeatDirection": "v",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "avg(nomad_client_allocs_cpu_total_ticks{instance=~\"$instance\"}) by(exported_job, task)",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "{{task}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "CPU Total Ticks",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 3,
+ "format": "timeticks",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "gridPos": {
+ "h": 6,
+ "w": 12,
+ "x": 0,
+ "y": 19
+ },
+ "id": 6,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": "host",
+ "repeatDirection": "v",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "avg(nomad_client_allocs_memory_rss{instance=~\"$instance\"}) by(exported_job, task)",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "{{task}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "RSS",
+ "tooltip": {
+ "shared": false,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 3,
+ "format": "decbytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "gridPos": {
+ "h": 6,
+ "w": 12,
+ "x": 12,
+ "y": 19
+ },
+ "id": 7,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": "host",
+ "repeatDirection": "v",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "avg(nomad_client_allocs_memory_cache{instance=~\"$instance\"}) by(exported_job, task)",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "{{task}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Memory Cache",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 3,
+ "format": "decbytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "refresh": false,
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "selected": false,
+ "text": "default",
+ "value": "default"
+ },
+ "hide": 0,
+ "includeAll": false,
+ "label": "datasource",
+ "multi": false,
+ "name": "DS_PROMETHEUS",
+ "options": [],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "type": "datasource"
+ },
+ {
+ "allValue": null,
+ "current": {},
+ "datasource": "${DS_PROMETHEUS}",
+ "hide": 0,
+ "includeAll": false,
+ "label": "DC",
+ "multi": false,
+ "name": "datacenter",
+ "options": [],
+ "query": "label_values(nomad_client_uptime, datacenter)",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": {},
+ "datasource": "${DS_PROMETHEUS}",
+ "hide": 0,
+ "includeAll": true,
+ "label": "Host",
+ "multi": true,
+ "name": "instance",
+ "options": [],
+ "query": "label_values(nomad_client_uptime{datacenter=~\"$datacenter\"}, instance)",
+ "refresh": 2,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-4h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "browser",
+ "title": "Nomad",
+ "version": 1
+} \ No newline at end of file
diff --git a/terraform-ci-infra/1n_nmd/grafana/conf/nomad/grafana.hcl b/terraform-ci-infra/1n_nmd/grafana/conf/nomad/grafana.hcl
new file mode 100644
index 0000000000..7325c6aef4
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/grafana/conf/nomad/grafana.hcl
@@ -0,0 +1,331 @@
+job "${job_name}" {
+ # The "region" parameter specifies the region in which to execute the job.
+ # If omitted, this inherits the default region name of "global".
+ # region = "global"
+ #
+ # The "datacenters" parameter specifies the list of datacenters which should
+ # be considered when placing this task. This must be provided.
+ datacenters = "${datacenters}"
+
+ # The "type" parameter controls the type of job, which impacts the scheduler's
+ # decision on placement. This configuration is optional and defaults to
+ # "service". For a full list of job types and their differences, please see
+ # the online documentation.
+ #
+ # For more information, please see the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/jobspec/schedulers
+ #
+ type = "service"
+
+ update {
+ # The "max_parallel" parameter specifies the maximum number of updates to
+ # perform in parallel. In this case, this specifies to update a single task
+ # at a time.
+ max_parallel = 1
+
+ health_check = "checks"
+
+ # The "min_healthy_time" parameter specifies the minimum time the allocation
+ # must be in the healthy state before it is marked as healthy and unblocks
+ # further allocations from being updated.
+ min_healthy_time = "10s"
+
+ # The "healthy_deadline" parameter specifies the deadline in which the
+ # allocation must be marked as healthy after which the allocation is
+ # automatically transitioned to unhealthy. Transitioning to unhealthy will
+ # fail the deployment and potentially roll back the job if "auto_revert" is
+ # set to true.
+ healthy_deadline = "3m"
+
+ # The "progress_deadline" parameter specifies the deadline in which an
+ # allocation must be marked as healthy. The deadline begins when the first
+ # allocation for the deployment is created and is reset whenever an allocation
+ # as part of the deployment transitions to a healthy state. If no allocation
+ # transitions to the healthy state before the progress deadline, the
+ # deployment is marked as failed.
+ progress_deadline = "10m"
+
+%{ if use_canary }
+ # The "canary" parameter specifies that changes to the job that would result
+ # in destructive updates should create the specified number of canaries
+ # without stopping any previous allocations. Once the operator determines the
+ # canaries are healthy, they can be promoted which unblocks a rolling update
+ # of the remaining allocations at a rate of "max_parallel".
+ #
+ # Further, setting "canary" equal to the count of the task group allows
+ # blue/green deployments. When the job is updated, a full set of the new
+ # version is deployed and upon promotion the old version is stopped.
+ canary = 1
+
+ # Specifies if the job should auto-promote to the canary version when all
+ # canaries become healthy during a deployment. Defaults to false, which
+ # means canaries must be promoted manually with the "nomad deployment
+ # promote" command.
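+ # (For example: "nomad deployment promote <deployment-id>", with the ID
+ # taken from "nomad deployment list".)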
+ auto_promote = true
+
+ # The "auto_revert" parameter specifies if the job should auto-revert to the
+ # last stable job on deployment failure. A job is marked as stable if all the
+ # allocations as part of its deployment were marked healthy.
+ auto_revert = true
+%{ endif }
+ }
+
+ # The "group" stanza defines a series of tasks that should be co-located on
+ # the same Nomad client. Any task within a group will be placed on the same
+ # client.
+ #
+ # For more information and examples on the "group" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/group
+ #
+ group "prod-group1-${service_name}" {
+ # The "count" parameter specifies the number of the task groups that should
+ # be running under this group. This value must be non-negative and defaults
+ # to 1.
+ count = ${group_count}
+
+ # The constraint allows restricting the set of eligible nodes. Constraints
+ # may filter on attributes or client metadata.
+ #
+ # For more information and examples on the "volume" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/constraint
+ #
+ constraint {
+ attribute = "$${attr.cpu.arch}"
+ operator = "!="
+ value = "arm64"
+ }
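+
+ # Note that "$${attr.cpu.arch}" above is a Terraform template escape:
+ # rendering strips one "$", leaving a Nomad runtime interpolation that
+ # resolves per client at placement time.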
+
+ # The "task" stanza creates an individual unit of work, such as a Docker
+ # container, web application, or batch processing.
+ #
+ # For more information and examples on the "task" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/task
+ #
+ task "prod-task1-${service_name}" {
+ # The "driver" parameter specifies the task driver that should be used to
+ # run the task.
+ driver = "docker"
+
+ # The "config" stanza specifies the driver configuration, which is passed
+ # directly to the driver to start the task. The details of configurations
+ # are specific to each driver, so please see specific driver
+ # documentation for more information.
+ config {
+ image = "${image}"
+ dns_servers = [ "$${attr.unique.network.ip-address}" ]
+ volumes = [
+ "secrets/prometheus.yml:/etc/grafana/provisioning/datasources/prometheus.yml",
+ "secrets/dashboards.yml:/etc/grafana/provisioning/dashboards/dashboards.yml",
+ "secrets/grafana.ini:/etc/grafana/grafana.ini",
+ "secrets/node_exporter.json:/etc/grafana/provisioning/dashboards/node_exporter.json",
+ "secrets/docker_cadvisor.json:/etc/grafana/provisioning/dashboards/docker_cadvisor.json",
+ "secrets/nomad.json:/etc/grafana/provisioning/dashboards/nomad.json",
+ "secrets/consul.json:/etc/grafana/provisioning/dashboards/consul.json",
+ "secrets/prometheus.json:/etc/grafana/provisioning/dashboards/prometheus.json",
+ "secrets/blackbox_exporter_http.json:/etc/grafana/provisioning/dashboards/blackbox_exporter_http.json",
+ "secrets/blackbox_exporter_icmp.json:/etc/grafana/provisioning/dashboards/blackbox_exporter_icmp.json"
+ ]
+ }
+
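+ # The "artifact" stanzas below download the dashboard JSON files into the
+ # task's "secrets/" directory, and the "template" stanzas further down
+ # render the configuration files there as well; the "volumes" list above
+ # bind-mounts all of them into Grafana's provisioning paths.
+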
+ artifact {
+ # Prometheus Node Exporter
+ source = "https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/node_exporter.json"
+ destination = "secrets/"
+ }
+
+ artifact {
+ # Docker cAdvisor
+ source = "https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/docker_cadvisor.json"
+ destination = "secrets/"
+ }
+
+ artifact {
+ # Nomad
+ source = "https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/nomad.json"
+ destination = "secrets/"
+ }
+
+ artifact {
+ # Consul
+ source = "https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/consul.json"
+ destination = "secrets/"
+ }
+
+ artifact {
+ # Prometheus
+ source = "https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/prometheus.json"
+ destination = "secrets/"
+ }
+
+ artifact {
+ # Prometheus Blackbox Exporter HTTP
+ source = "https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/blackbox_exporter_http.json"
+ destination = "secrets/"
+ }
+
+ artifact {
+ # Prometheus Blackbox Exporter ICMP
+ source = "https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/blackbox_exporter_icmp.json"
+ destination = "secrets/"
+ }
+
+ # The "template" stanza instructs Nomad to manage a template, such as
+ # a configuration file or script. This template can optionally pull data
+ # from Consul or Vault to populate runtime configuration data.
+ #
+ # For more information and examples on the "template" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/template
+ #
+ template {
+ change_mode = "noop"
+ change_signal = "SIGINT"
+ destination = "secrets/prometheus.yml"
+ data = <<EOH
+apiVersion: 1
+datasources:
+- name: Prometheus
+ type: prometheus
+ access: direct
+ orgId: 1
+ url: http://prometheus.service.consul:9090
+ basicAuth: false
+ isDefault: true
+ version: 1
+ editable: false
+EOH
+ }
+
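+ # A note on the datasource rendered above: "access: direct" makes the
+ # browser query Prometheus directly, so prometheus.service.consul must be
+ # resolvable on the client side; "proxy" would route queries through the
+ # Grafana server instead.
+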
+ template {
+ change_mode = "noop"
+ change_signal = "SIGINT"
+ destination = "secrets/dashboards.yml"
+ data = <<EOH
+apiVersion: 1
+providers:
+- name: dashboards
+ type: file
+ disableDeletion: false
+ updateIntervalSeconds: 10
+ allowUiUpdates: false
+ options:
+ path: /etc/grafana/provisioning/dashboards
+ foldersFromFilesStructure: true
+EOH
+ }
+
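+ # The provider "path" above matches the mount target used in the "config"
+ # stanza, so every dashboard JSON bind-mounted into
+ # /etc/grafana/provisioning/dashboards is picked up automatically.
+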
+ template {
+ change_mode = "noop"
+ change_signal = "SIGINT"
+ destination = "secrets/grafana.ini"
+ data = <<EOH
+app_mode = production
+
+[metrics]
+enabled = true
+
+[server]
+protocol = http
+http_port = ${port}
+root_url = http://${service_name}.service.consul:${port}
+enable_gzip = true
+;cert_file =
+;cert_key =
+
+[security]
+admin_user = grafanauser
+admin_password = Grafana1234
+secret_key = SW2YcwTIb9zpOOhoPsMm
+
+[users]
+allow_sign_up = false
+allow_org_create = false
+auto_assign_org = true
+auto_assign_org_role = Viewer
+default_theme = dark
+
+[auth.basic]
+enabled = true
+
+[auth]
+disable_login_form = false
+disable_signout_menu = false
+
+[auth.anonymous]
+enabled = false
+
+[log]
+mode = console
+level = info
+
+[log.console]
+level = info
+format = console
+EOH
+ }
+
+ # The service stanza instructs Nomad to register a service with Consul.
+ #
+ # For more information and examples on the "task" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/service
+ #
+ service {
+ name = "${service_name}"
+ port = "${service_name}"
+ tags = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ]
+ check {
+ name = "Grafana Check Live"
+ type = "http"
+ protocol = "http"
+ tls_skip_verify = true
+ path = "/api/health"
+ interval = "10s"
+ timeout = "2s"
+ }
+ }
+
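+ # The Consul registration above is what makes the
+ # "${service_name}.service.consul" name used by root_url in grafana.ini
+ # resolvable, and its health check feeds the "update" stanza's
+ # health_check = "checks" gating during deployments.
+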
+ # The "resources" stanza describes the requirements a task needs to
+ # execute. Resource requirements include memory, network, cpu, and more.
+ # This ensures the task will execute on a machine that contains enough
+ # resource capacity.
+ #
+ # For more information and examples on the "resources" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/resources
+ #
+ resources {
+ cpu = ${cpu}
+ memory = ${mem}
+ # The network stanza specifies the networking requirements for the task
+ # group, including the network mode and port allocations. When scheduling
+ # jobs in Nomad they are provisioned across your fleet of machines along
+ # with other jobs and services. Because you don't know in advance what host
+ # your job will be provisioned on, Nomad will provide your tasks with
+ # network configuration when they start up.
+ #
+ # For more information and examples on the "network" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/network
+ #
+ network {
+ port "${service_name}" {
+ static = ${port}
+ }
+ }
+ }
+ }
+ }
+}
\ No newline at end of file
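
The nine ${...} placeholders in the jobspec above (job_name, datacenters, use_canary, group_count, service_name, image, port, cpu and mem) are Terraform template variables, while the $${...} forms are escapes that survive rendering for Nomad's own runtime interpolation. A minimal sketch of how an enclosing module might render and submit this template, assuming the Terraform Nomad provider's nomad_job resource; the resource name and all values below are illustrative, not taken from this patch:

# Hypothetical caller of conf/nomad/grafana.hcl (all values are assumptions).
resource "nomad_job" "grafana" {
  jobspec = templatefile(
    "${path.module}/conf/nomad/grafana.hcl",
    {
      job_name     = "prod-grafana"          # becomes job "${job_name}"
      datacenters  = "yul1"                  # substituted verbatim
      use_canary   = true                    # toggles the %{ if use_canary } block
      group_count  = 1                       # task group instances
      service_name = "grafana"               # Consul service and port label
      image        = "grafana/grafana:7.3.7" # assumed image tag
      port         = 3000                    # Grafana's default HTTP port
      cpu          = 2000                    # MHz
      mem          = 2048                    # MB
    }
  )
}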
diff --git a/terraform-ci-infra/1n_nmd/grafana/conf/prometheus.json b/terraform-ci-infra/1n_nmd/grafana/conf/prometheus.json
new file mode 100644
index 0000000000..0f07574b07
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/grafana/conf/prometheus.json
@@ -0,0 +1,3055 @@
+{
+ "__inputs": [
+ {
+ "name": "DS_PROMETHEUS",
+ "label": "prometheus",
+ "description": "A prometheus server with prometheus server metrics",
+ "type": "datasource",
+ "pluginId": "prometheus",
+ "pluginName": "Prometheus"
+ }
+ ],
+ "__requires": [
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "4.5.0-beta1"
+ },
+ {
+ "type": "panel",
+ "id": "graph",
+ "name": "Graph",
+ "version": ""
+ },
+ {
+ "type": "datasource",
+ "id": "prometheus",
+ "name": "Prometheus",
+ "version": "1.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "singlestat",
+ "name": "Singlestat",
+ "version": ""
+ },
+ {
+ "type": "panel",
+ "id": "table",
+ "name": "Table",
+ "version": ""
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "datasource": "${DS_PROMETHEUS}",
+ "enable": true,
+ "expr": "sum(changes(prometheus_config_last_reload_success_timestamp_seconds{instance=~\"$instance\"}[10m])) by (instance)",
+ "hide": false,
+ "iconColor": "rgb(0, 96, 19)",
+ "limit": 100,
+ "name": "reloads",
+ "showIn": 0,
+ "step": "5m",
+ "type": "alert"
+ },
+ {
+ "datasource": "${DS_PROMETHEUS}",
+ "enable": true,
+ "expr": "count(sum(up{instance=\"$instance\"}) by (instance) < 1)",
+ "hide": false,
+ "iconColor": "rgba(255, 96, 96, 1)",
+ "limit": 100,
+ "name": "down",
+ "showIn": 0,
+ "step": "5m",
+ "type": "alert"
+ }
+ ]
+ },
+ "description": "Get started faster with Grafana Cloud then easily build these dashboards. https://grafana.com/products/cloud/\nOverview of metrics from Prometheus 2.0. \nUseful for using prometheus to monitor your prometheus.\nRevisions welcome!",
+ "editable": true,
+ "gnetId": 3662,
+ "graphTooltip": 0,
+ "hideControls": false,
+ "id": null,
+ "links": [],
+ "refresh": "30s",
+ "rows": [
+ {
+ "collapse": false,
+ "height": 250,
+ "panels": [
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": true,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "decimals": 3,
+ "description": "Percentage of uptime during the most recent $interval period. Change the period with the 'interval' dropdown above.",
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": false
+ },
+ "id": 2,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "%",
+ "postfixFontSize": "100%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "span": 3,
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": true,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "avg(avg_over_time(up{instance=~\"$instance\",job=~\"$job\"}[$interval]) * 100)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 40
+ }
+ ],
+ "thresholds": "90, 99",
+ "title": "Uptime [$interval]",
+ "type": "singlestat",
+ "valueFontSize": "100%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "columns": [],
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "Servers which are DOWN RIGHT NOW! \nFIX THEM!!",
+ "fontSize": "100%",
+ "hideTimeOverride": true,
+ "id": 25,
+ "links": [],
+ "pageSize": null,
+ "scroll": true,
+ "showHeader": true,
+ "sort": {
+ "col": 0,
+ "desc": true
+ },
+ "span": 3,
+ "styles": [
+ {
+ "alias": "Time",
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "pattern": "Time",
+ "type": "hidden"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "/__name__|job|Value/",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": " ",
+ "colorMode": "cell",
+ "colors": [
+ "rgba(255, 0, 0, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(255, 0, 0, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "link": false,
+ "pattern": "instance",
+ "thresholds": [
+ "",
+ "",
+ ""
+ ],
+ "type": "string",
+ "unit": "short"
+ }
+ ],
+ "targets": [
+ {
+ "expr": "up{instance=~\"$instance\",job=~\"$job\"} < 1",
+ "format": "table",
+ "intervalFactor": 2,
+ "refId": "A",
+ "step": 2
+ }
+ ],
+ "timeFrom": "1s",
+ "title": "Currently Down",
+ "transform": "table",
+ "type": "table"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": true,
+ "colors": [
+ "rgba(50, 172, 45, 0.97)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(245, 54, 54, 0.9)"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "Total number of time series in prometheus",
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "id": 12,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "span": 3,
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": true,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": true
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(prometheus_tsdb_head_series{job=~\"$job\",instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "refId": "B",
+ "step": 40
+ }
+ ],
+ "thresholds": "1000000,2000000",
+ "title": "Total Series",
+ "type": "singlestat",
+ "valueFontSize": "100%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "id": 14,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "span": 3,
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": true,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": true
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(prometheus_tsdb_head_chunks{job=~\"$job\",instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "refId": "B",
+ "step": 40
+ }
+ ],
+ "thresholds": "",
+ "title": "Memory Chunks",
+ "type": "singlestat",
+ "valueFontSize": "100%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "at a glance",
+ "titleSize": "h6"
+ },
+ {
+ "collapse": false,
+ "height": 236,
+ "panels": [
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": true,
+ "colors": [
+ "rgba(50, 172, 45, 0.97)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(245, 54, 54, 0.9)"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "The total number of rule group evaluations missed due to slow rule group evaluation.",
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "id": 16,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "span": 2,
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(sum_over_time(prometheus_evaluator_iterations_missed_total{job=~\"$job\",instance=~\"$instance\"}[$interval]))",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "refId": "A",
+ "step": 40
+ }
+ ],
+ "thresholds": "1,10",
+ "title": "Missed Iterations [$interval]",
+ "type": "singlestat",
+ "valueFontSize": "100%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": true,
+ "colors": [
+ "rgba(50, 172, 45, 0.97)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(245, 54, 54, 0.9)"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "The total number of rule group evaluations skipped due to throttled metric storage.",
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "id": 18,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "span": 2,
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(sum_over_time(prometheus_evaluator_iterations_skipped_total{job=~\"$job\",instance=~\"$instance\"}[$interval]))",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "refId": "A",
+ "step": 40
+ }
+ ],
+ "thresholds": "1,10",
+ "title": "Skipped Iterations [$interval]",
+ "type": "singlestat",
+ "valueFontSize": "100%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": true,
+ "colors": [
+ "rgba(50, 172, 45, 0.97)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(245, 54, 54, 0.9)"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "Total number of scrapes that hit the sample limit and were rejected.",
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "id": 19,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "span": 2,
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(sum_over_time(prometheus_target_scrapes_exceeded_sample_limit_total{job=~\"$job\",instance=~\"$instance\"}[$interval]))",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "refId": "A",
+ "step": 40
+ }
+ ],
+ "thresholds": "1,10",
+ "title": "Tardy Scrapes [$interval]",
+ "type": "singlestat",
+ "valueFontSize": "100%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": true,
+ "colors": [
+ "rgba(50, 172, 45, 0.97)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(245, 54, 54, 0.9)"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "Number of times the database failed to reload block data from disk.",
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "id": 13,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "span": 2,
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(sum_over_time(prometheus_tsdb_reloads_failures_total{job=~\"$job\",instance=~\"$instance\"}[$interval]))",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "refId": "A",
+ "step": 40
+ }
+ ],
+ "thresholds": "1,10",
+ "title": "Reload Failures [$interval]",
+ "type": "singlestat",
+ "valueFontSize": "100%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": true,
+ "colors": [
+ "rgba(50, 172, 45, 0.97)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(245, 54, 54, 0.9)"
+ ],
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "Sum of all skipped scrapes",
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "id": 20,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "span": 4,
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(sum_over_time(prometheus_target_scrapes_exceeded_sample_limit_total{job=~\"$job\",instance=~\"$instance\"}[$interval])) + \nsum(sum_over_time(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=~\"$job\",instance=~\"$instance\"}[$interval])) + \nsum(sum_over_time(prometheus_target_scrapes_sample_out_of_bounds_total{job=~\"$job\",instance=~\"$instance\"}[$interval])) + \nsum(sum_over_time(prometheus_target_scrapes_sample_out_of_order_total{job=~\"$job\",instance=~\"$instance\"}[$interval])) ",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "refId": "A",
+ "step": 40
+ }
+ ],
+ "thresholds": "1,10",
+ "title": "Skipped Scrapes [$interval]",
+ "type": "singlestat",
+ "valueFontSize": "100%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "quick numbers",
+ "titleSize": "h6"
+ },
+ {
+ "collapse": false,
+ "height": 250,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "All non-zero failures and errors",
+ "fill": 1,
+ "id": 33,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 12,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(increase(net_conntrack_dialer_conn_failed_total{instance=~\"$instance\"}[5m])) > 0",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Failed Connections",
+ "refId": "A",
+ "step": 2
+ },
+ {
+ "expr": "sum(increase(prometheus_evaluator_iterations_missed_total{instance=~\"$instance\"}[5m])) > 0",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Missed Iterations",
+ "refId": "B",
+ "step": 2
+ },
+ {
+ "expr": "sum(increase(prometheus_evaluator_iterations_skipped_total{instance=~\"$instance\"}[5m])) > 0",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Skipped Iterations",
+ "refId": "C",
+ "step": 2
+ },
+ {
+ "expr": "sum(increase(prometheus_rule_evaluation_failures_total{instance=~\"$instance\"}[5m])) > 0",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Evaluation",
+ "refId": "D",
+ "step": 2
+ },
+ {
+ "expr": "sum(increase(prometheus_sd_azure_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Azure Refresh",
+ "refId": "E",
+ "step": 2
+ },
+ {
+ "expr": "sum(increase(prometheus_sd_consul_rpc_failures_total{instance=~\"$instance\"}[5m])) > 0",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Consul RPC",
+ "refId": "F",
+ "step": 2
+ },
+ {
+ "expr": "sum(increase(prometheus_sd_dns_lookup_failures_total{instance=~\"$instance\"}[5m])) > 0",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "DNS Lookup",
+ "refId": "G",
+ "step": 2
+ },
+ {
+ "expr": "sum(increase(prometheus_sd_ec2_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "EC2 Refresh",
+ "refId": "H",
+ "step": 2
+ },
+ {
+ "expr": "sum(increase(prometheus_sd_gce_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "GCE Refresh",
+ "refId": "I",
+ "step": 2
+ },
+ {
+ "expr": "sum(increase(prometheus_sd_marathon_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Marathon Refresh",
+ "refId": "J",
+ "step": 2
+ },
+ {
+ "expr": "sum(increase(prometheus_sd_openstack_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Openstack Refresh",
+ "refId": "K",
+ "step": 2
+ },
+ {
+ "expr": "sum(increase(prometheus_sd_triton_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Triton Refresh",
+ "refId": "L",
+ "step": 2
+ },
+ {
+ "expr": "sum(increase(prometheus_target_scrapes_exceeded_sample_limit_total{instance=~\"$instance\"}[5m])) > 0",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Sample Limit",
+ "refId": "M",
+ "step": 2
+ },
+ {
+ "expr": "sum(increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{instance=~\"$instance\"}[5m])) > 0",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Duplicate Timestamp",
+ "refId": "N",
+ "step": 2
+ },
+ {
+ "expr": "sum(increase(prometheus_target_scrapes_sample_out_of_bounds_total{instance=~\"$instance\"}[5m])) > 0",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Timestamp Out of Bounds",
+ "refId": "O",
+ "step": 2
+ },
+ {
+ "expr": "sum(increase(prometheus_target_scrapes_sample_out_of_order_total{instance=~\"$instance\"}[5m])) > 0",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Sample Out of Order",
+ "refId": "P",
+ "step": 2
+ },
+ {
+ "expr": "sum(increase(prometheus_treecache_zookeeper_failures_total{instance=~\"$instance\"}[5m])) > 0",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Zookeeper",
+ "refId": "Q",
+ "step": 2
+ },
+ {
+ "expr": "sum(increase(prometheus_tsdb_compactions_failed_total{instance=~\"$instance\"}[5m])) > 0",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "TSDB Compactions",
+ "refId": "R",
+ "step": 2
+ },
+ {
+ "expr": "sum(increase(prometheus_tsdb_head_series_not_found{instance=~\"$instance\"}[5m])) > 0",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Series Not Found",
+ "refId": "S",
+ "step": 2
+ },
+ {
+ "expr": "sum(increase(prometheus_tsdb_reloads_failures_total{instance=~\"$instance\"}[5m])) > 0",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Reload",
+ "refId": "T",
+ "step": 2
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Failures and Errors",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Errors",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "errors",
+ "titleSize": "h6"
+ },
+ {
+ "collapse": false,
+ "height": "250px",
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "id": 1,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "up{instance=~\"$instance\",job=~\"$job\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}",
+ "refId": "A",
+ "step": 2
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Upness (stacked)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 0,
+ "format": "none",
+ "label": "Up",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "id": 5,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "prometheus_tsdb_head_chunks{job=~\"$job\",instance=~\"$instance\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{instance}}",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Storage Memory Chunks",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Chunks",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "up",
+ "titleSize": "h6"
+ },
+ {
+ "collapse": false,
+ "height": 250,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "id": 3,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "prometheus_tsdb_head_series{job=~\"$job\",instance=~\"$instance\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{instance}}",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Series Count",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Series",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "id": 32,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "removed",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum( increase(prometheus_tsdb_head_series_created_total{instance=~\"$instance\"}[5m]) )",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "created",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "sum( increase(prometheus_tsdb_head_series_removed_total{instance=~\"$instance\"}[5m]) )",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "removed",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Series Created / Removed",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Series Count",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "series",
+ "titleSize": "h6"
+ },
+ {
+ "collapse": false,
+ "height": 250,
+ "panels": [
+ {
+ "aliasColors": {
+ "10.58.3.10:80": "#BA43A9"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "Rate of total number of appended samples",
+ "fill": 1,
+ "id": 4,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 12,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(prometheus_tsdb_head_samples_appended_total{job=~\"$job\",instance=~\"$instance\"}[1m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{instance}}",
+ "refId": "A",
+ "step": 2
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Appended Samples per Second",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Samples / Second",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "appended samples",
+ "titleSize": "h6"
+ },
+ {
+ "collapse": false,
+ "height": 250,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "Total number of syncs that were executed on a scrape pool.",
+ "fill": 1,
+ "id": 6,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(prometheus_target_scrape_pool_sync_total{job=~\"$job\",instance=~\"$instance\"}) by (scrape_job)",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "{{scrape_job}}",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Scrape Sync Total",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Syncs",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "Actual interval to sync the scrape pool.",
+ "fill": 1,
+ "id": 21,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(prometheus_target_sync_length_seconds_sum{job=~\"$job\",instance=~\"$instance\"}[2m])) by (scrape_job) * 1000",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{scrape_job}}",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Target Sync",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Milliseconds",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "sync",
+ "titleSize": "h6"
+ },
+ {
+ "collapse": false,
+ "height": 250,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "id": 29,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "scrape_duration_seconds{instance=~\"$instance\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{instance}}",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Scrape Duration",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Seconds",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "Total number of rejected scrapes",
+ "fill": 1,
+ "id": 30,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(prometheus_target_scrapes_exceeded_sample_limit_total{job=~\"$job\",instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "exceeded sample limit",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "sum(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=~\"$job\",instance=~\"$instance\"})",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "duplicate timestamp",
+ "refId": "B",
+ "step": 4
+ },
+ {
+ "expr": "sum(prometheus_target_scrapes_sample_out_of_bounds_total{job=~\"$job\",instance=~\"$instance\"})",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "out of bounds",
+ "refId": "C",
+ "step": 4
+ },
+ {
+ "expr": "sum(prometheus_target_scrapes_sample_out_of_order_total{job=~\"$job\",instance=~\"$instance\"}) ",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "out of order",
+ "refId": "D",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Rejected Scrapes",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 0,
+ "format": "short",
+ "label": "Scrapes",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "scrapes",
+ "titleSize": "h6"
+ },
+ {
+ "collapse": false,
+ "height": 250,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "The duration of rule group evaluations",
+ "fill": 1,
+ "id": 10,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "1000 * rate(prometheus_evaluator_duration_seconds_sum{job=~\"$job\", instance=~\"$instance\"}[5m]) / rate(prometheus_evaluator_duration_seconds_count{job=~\"$job\", instance=~\"$instance\"}[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{instance}}",
+ "refId": "E",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Average Rule Evaluation Duration",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Milliseconds",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "id": 11,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(http_request_duration_microseconds_count{job=~\"$job\",instance=~\"$instance\"}[1m])) by (handler) > 0",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{handler}}",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "HTTP Request Duration",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Microseconds",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "id": 15,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(prometheus_engine_query_duration_seconds_sum{job=~\"$job\",instance=~\"$instance\"}) by (slice)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{slice}}",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Prometheus Engine Query Duration Seconds",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Seconds",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "Rule-group evaluations \n - total\n - missed due to slow rule group evaluation\n - skipped due to throttled metric storage",
+ "fill": 1,
+ "id": 31,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(prometheus_evaluator_iterations_total{job=~\"$job\", instance=~\"$instance\"}[5m]))",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Total",
+ "refId": "B",
+ "step": 4
+ },
+ {
+ "expr": "sum(rate(prometheus_evaluator_iterations_missed_total{job=~\"$job\", instance=~\"$instance\"}[5m]))",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Missed",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "sum(rate(prometheus_evaluator_iterations_skipped_total{job=~\"$job\", instance=~\"$instance\"}[5m]))",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Skipped",
+ "refId": "C",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Rule Evaluator Iterations",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "iterations",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "durations",
+ "titleSize": "h6"
+ },
+ {
+ "collapse": false,
+ "height": 250,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "id": 22,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 12,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(prometheus_notifications_sent_total[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{instance}}",
+ "refId": "A",
+ "step": 2
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Notifications Sent",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Notifications",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "notifications",
+ "titleSize": "h6"
+ },
+ {
+ "collapse": false,
+ "height": 250,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "id": 23,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "(time() - prometheus_config_last_reload_success_timestamp_seconds{job=~\"$job\",instance=~\"$instance\"}) / 60",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{instance}}",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Minutes Since Successful Config Reload",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Minutes",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "id": 24,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "prometheus_config_last_reload_successful{job=~\"$job\",instance=~\"$instance\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{instance}}",
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Successful Config Reload",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 0,
+ "format": "short",
+ "label": "Success",
+ "logBase": 1,
+ "max": "1",
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "config",
+ "titleSize": "h6"
+ },
+ {
+ "collapse": false,
+ "height": 250,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "GC invocation durations",
+ "fill": 1,
+ "id": 28,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 12,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(go_gc_duration_seconds_sum{instance=~\"$instance\",job=~\"$job\"}[2m])) by (instance)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{instance}}",
+ "refId": "A",
+ "step": 2
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "GC Rate / 2m",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "garbage collection",
+ "titleSize": "h6"
+ },
+ {
+ "collapse": true,
+ "height": 250,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "description": "This is probably wrong! Please help.",
+ "fill": 1,
+ "id": 26,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "allocated",
+ "stack": false
+ }
+ ],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(go_memstats_alloc_bytes_total{job=~\"$job\", instance=~\"$instance\"})",
+ "format": "time_series",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "alloc_bytes_total",
+ "refId": "A",
+ "step": 10
+ },
+ {
+ "expr": "sum(go_memstats_alloc_bytes{job=~\"$job\", instance=~\"$instance\"})",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "allocated",
+ "refId": "B",
+ "step": 10
+ },
+ {
+ "expr": "sum(go_memstats_buck_hash_sys_bytes{job=~\"$job\", instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "profiling bucket hash table",
+ "refId": "C",
+ "step": 10
+ },
+ {
+ "expr": "sum(go_memstats_gc_sys_bytes{job=~\"$job\", instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "GC metadata",
+ "refId": "D",
+ "step": 10
+ },
+ {
+ "expr": "sum(go_memstats_heap_alloc_bytes{job=~\"$job\", instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "heap in-use",
+ "refId": "E",
+ "step": 10
+ },
+ {
+ "expr": "sum(go_memstats_heap_idle_bytes{job=~\"$job\", instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "heap idle",
+ "refId": "F",
+ "step": 10
+ },
+ {
+ "expr": "sum(go_memstats_heap_inuse_bytes{job=~\"$job\", instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "heap in use",
+ "refId": "G",
+ "step": 10
+ },
+ {
+ "expr": "sum(go_memstats_heap_released_bytes{job=~\"$job\", instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "heap released",
+ "refId": "H",
+ "step": 10
+ },
+ {
+ "expr": "sum(go_memstats_heap_sys_bytes{job=~\"$job\", instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "heap system",
+ "refId": "I",
+ "step": 10
+ },
+ {
+ "expr": "sum(go_memstats_mcache_inuse_bytes{job=~\"$job\", instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "mcache in use",
+ "refId": "J",
+ "step": 10
+ },
+ {
+ "expr": "sum(go_memstats_mcache_sys_bytes{job=~\"$job\", instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "mcache sys",
+ "refId": "K",
+ "step": 10
+ },
+ {
+ "expr": "sum(go_memstats_mspan_inuse_bytes{job=~\"$job\", instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "mspan in use",
+ "refId": "L",
+ "step": 10
+ },
+ {
+ "expr": "sum(go_memstats_mspan_sys_bytes{job=~\"$job\", instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "mspan sys",
+ "refId": "M",
+ "step": 10
+ },
+ {
+ "expr": "sum(go_memstats_next_gc_bytes{job=~\"$job\", instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "heap next gc",
+ "refId": "N",
+ "step": 10
+ },
+ {
+ "expr": "sum(go_memstats_other_sys_bytes{job=~\"$job\", instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "other sys",
+ "refId": "O",
+ "step": 10
+ },
+ {
+ "expr": "sum(go_memstats_stack_inuse_bytes{job=~\"$job\", instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "stack in use",
+ "refId": "P",
+ "step": 10
+ },
+ {
+ "expr": "sum(go_memstats_stack_sys_bytes{job=~\"$job\", instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "stack sys",
+ "refId": "Q",
+ "step": 10
+ },
+ {
+ "expr": "sum(go_memstats_sys_bytes{job=~\"$job\", instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "sys",
+ "refId": "R",
+ "step": 10
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Go Memory Usage (FIXME)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "id": 9,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 3,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "prometheus_target_interval_length_seconds{instance=~\"$instance\", job=~\"$job\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{quantile}} {{interval}}",
+ "refId": "A",
+ "step": 20
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Scrape Duration",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Seconds",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "id": 7,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 3,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(prometheus_target_interval_length_seconds_count{job=~\"$job\",instance=~\"$instance\"}[5m])) by (interval)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{interval}}",
+ "refId": "A",
+ "step": 20
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Target Scrapes / 5m",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Scrapes",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "Broken, ignore",
+ "titleSize": "h6"
+ }
+ ],
+ "schemaVersion": 14,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "selected": false,
+ "text": "default",
+ "value": "default"
+ },
+ "hide": 0,
+ "includeAll": false,
+ "label": "datasource",
+ "multi": false,
+ "name": "DS_PROMETHEUS",
+ "options": [],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "type": "datasource"
+ },
+ {
+ "allValue": null,
+ "current": {},
+ "datasource": "${DS_PROMETHEUS}",
+ "hide": 0,
+ "includeAll": true,
+ "label": null,
+ "multi": true,
+ "name": "job",
+ "options": [],
+ "query": "query_result(prometheus_tsdb_head_samples_appended_total)",
+ "refresh": 2,
+ "regex": "/.*job=\"([^\"]+)/",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": {},
+ "datasource": "${DS_PROMETHEUS}",
+ "hide": 0,
+ "includeAll": true,
+ "label": null,
+ "multi": true,
+ "name": "instance",
+ "options": [],
+ "query": "query_result(up{job=~\"$job\"})",
+ "refresh": 2,
+ "regex": "/.*instance=\"([^\"]+).*/",
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": {
+ "selected": true,
+ "text": "1h",
+ "value": "1h"
+ },
+ "hide": 0,
+ "includeAll": false,
+ "label": null,
+ "multi": false,
+ "name": "interval",
+ "options": [
+ {
+ "selected": true,
+ "text": "1h",
+ "value": "1h"
+ },
+ {
+ "selected": false,
+ "text": "3h",
+ "value": "3h"
+ },
+ {
+ "selected": false,
+ "text": "6h",
+ "value": "6h"
+ },
+ {
+ "selected": false,
+ "text": "12h",
+ "value": "12h"
+ },
+ {
+ "selected": false,
+ "text": "1d",
+ "value": "1d"
+ },
+ {
+ "selected": false,
+ "text": "2d",
+ "value": "2d"
+ },
+ {
+ "selected": false,
+ "text": "7d",
+ "value": "7d"
+ },
+ {
+ "selected": false,
+ "text": "30d",
+ "value": "30d"
+ },
+ {
+ "selected": false,
+ "text": "90d",
+ "value": "90d"
+ },
+ {
+ "selected": false,
+ "text": "180d",
+ "value": "180d"
+ }
+ ],
+ "query": "1h, 3h, 6h, 12h, 1d, 2d, 7d, 30d, 90d, 180d",
+ "type": "custom"
+ }
+ ]
+ },
+ "time": {
+ "from": "now-4h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "browser",
+ "title": "Prometheus",
+ "version": 1
+}
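The dashboard JSON above still has to reach Grafana at run time. A minimal sketch of one way to wire it in, assuming a Nomad template stanza is added to the grafana task and that Grafana's file-based dashboard provisioning is pointed at local/dashboards (both the stanza and the path are assumptions, not part of this change):

    # Hypothetical template stanza for the grafana task; Grafana must be
    # provisioned with a file dashboard provider watching local/dashboards.
    template {
      change_mode = "noop"
      destination = "local/dashboards/prometheus.json"
      data        = <<EOH
    { ... the dashboard JSON above ... }
    EOH
    }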
diff --git a/terraform-ci-infra/1n_nmd/grafana/main.tf b/terraform-ci-infra/1n_nmd/grafana/main.tf
new file mode 100644
index 0000000000..b67ba03985
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/grafana/main.tf
@@ -0,0 +1,24 @@
+locals {
+ datacenters = join(",", var.nomad_datacenters)
+}
+
+data "template_file" "nomad_job_grafana" {
+ template = file("${path.module}/conf/nomad/grafana.hcl")
+ vars = {
+ datacenters = local.datacenters
+ job_name = var.grafana_job_name
+ use_canary = var.grafana_use_canary
+ group_count = var.grafana_group_count
+ service_name = var.grafana_service_name
+ use_vault_provider = var.grafana_vault_secret.use_vault_provider
+ image = var.grafana_container_image
+ cpu = var.grafana_cpu
+ mem = var.grafana_mem
+ port = var.grafana_port
+ }
+}
+
+resource "nomad_job" "nomad_job_grafana" {
+ jobspec = data.template_file.nomad_job_grafana.rendered
+ detach = false
+} \ No newline at end of file
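The template_file data source used above comes from the separate hashicorp/template provider, which is archived; on Terraform 0.12+ the built-in templatefile() function renders the same file without any extra provider. A minimal equivalent sketch, using the same inputs as above:

    locals {
      nomad_job_grafana_rendered = templatefile(
        "${path.module}/conf/nomad/grafana.hcl",
        {
          datacenters        = local.datacenters
          job_name           = var.grafana_job_name
          use_canary         = var.grafana_use_canary
          group_count        = var.grafana_group_count
          service_name       = var.grafana_service_name
          use_vault_provider = var.grafana_vault_secret.use_vault_provider
          image              = var.grafana_container_image
          cpu                = var.grafana_cpu
          mem                = var.grafana_mem
          port               = var.grafana_port
        }
      )
    }

    resource "nomad_job" "nomad_job_grafana" {
      jobspec = local.nomad_job_grafana_rendered
      detach  = false
    }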
diff --git a/terraform-ci-infra/1n_nmd/grafana/variables.tf b/terraform-ci-infra/1n_nmd/grafana/variables.tf
new file mode 100644
index 0000000000..0c2382b16a
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/grafana/variables.tf
@@ -0,0 +1,66 @@
+# Nomad
+variable "nomad_datacenters" {
+ description = "Nomad data centers"
+ type = list(string)
+ default = [ "dc1" ]
+}
+
+# Grafana
+variable "grafana_job_name" {
+ description = "Grafana job name"
+ type = string
+ default = "grafana"
+}
+
+variable "grafana_group_count" {
+ description = "Number of grafana group instances"
+ type = number
+ default = 1
+}
+
+variable "grafana_service_name" {
+ description = "Grafana service name"
+ type = string
+ default = "grafana"
+}
+
+variable "grafana_container_image" {
+ description = "Grafana docker image"
+ type = string
+ default = "grafana/grafana:7.3.7"
+}
+
+variable "grafana_use_canary" {
+ description = "Uses canary deployment"
+ type = bool
+ default = false
+}
+
+variable "grafana_vault_secret" {
+ description = "Set of properties to be able to fetch secret from vault"
+ type = object({
+ use_vault_provider = bool,
+ vault_kv_policy_name = string,
+ vault_kv_path = string,
+ vault_kv_field_access_key = string,
+ vault_kv_field_secret_key = string
+ })
+}
+
+variable "grafana_cpu" {
+ description = "Grafana CPU allocation"
+ type = number
+ default = 2000
+}
+
+variable "grafana_mem" {
+ description = "Grafana RAM allocation"
+ type = number
+ default = 8192
+}
+
+variable "grafana_port" {
+ description = "Grafana TCP allocation"
+ type = number
+ default = 3000
+} \ No newline at end of file
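grafana_vault_secret has no default, so every caller must pass the full object even when Vault is disabled (as the root module does below with use_vault_provider = false). A hypothetical caller enabling Vault, where the policy name and KV path are illustrative only:

    grafana_vault_secret = {
      use_vault_provider        = true,
      vault_kv_policy_name      = "kv-secret",
      vault_kv_path             = "secret/data/grafana",
      vault_kv_field_access_key = "access_key",
      vault_kv_field_secret_key = "secret_key"
    }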
diff --git a/terraform-ci-infra/1n_nmd/main.tf b/terraform-ci-infra/1n_nmd/main.tf
new file mode 100644
index 0000000000..98ae47d091
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/main.tf
@@ -0,0 +1,188 @@
+# For convenience in simple configurations, a child module automatically
+# inherits default (un-aliased) provider configurations from its parent.
+# This means that explicit provider blocks appear only in the root module,
+# and downstream modules can simply declare resources for that provider
+# and have them automatically associated with the root provider
+# configurations.
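Each child module here receives the aliased provider nomad.yul1, which must therefore be declared in this root module; a minimal sketch of that block, with the address being an assumption rather than part of this change:

    # Hypothetical root-module provider; the real cluster address is
    # environment-specific.
    provider "nomad" {
      alias   = "yul1"
      address = "http://nomad.example.com:4646"
    }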
+module "alertmanager" {
+ source = "./alertmanager"
+ providers = {
+ nomad = nomad.yul1
+ }
+
+ # nomad
+ nomad_datacenters = [ "yul1" ]
+
+ # alertmanager
+ alertmanager_job_name = "prod-alertmanager"
+ alertmanager_use_canary = true
+ alertmanager_group_count = 1
+ alertmanager_vault_secret = {
+ use_vault_provider = false,
+ vault_kv_policy_name = "kv-secret",
+ vault_kv_path = "secret/data/prometheus",
+ vault_kv_field_access_key = "access_key",
+ vault_kv_field_secret_key = "secret_key"
+ }
+ alertmanager_version = "0.21.0"
+ alertmanager_cpu = 1000
+ alertmanager_mem = 1024
+ alertmanager_port = 9093
+ alertmanager_slack_api_url = "https://hooks.slack.com/services/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"
+ alertmanager_slack_channel = "fdio-infra-monitoring"
+}
+
+module "exporter" {
+ source = "./exporter"
+ providers = {
+ nomad = nomad.yul1
+ }
+
+ # nomad
+ nomad_datacenters = [ "yul1" ]
+
+ # exporter
+ exporter_job_name = "prod-exporter"
+ exporter_use_canary = false
+
+ # node
+ node_version = "1.0.1"
+ node_port = 9100
+
+ # blackbox
+ blackbox_version = "0.18.0"
+ blackbox_port = 9115
+
+ # cadvisor
+ cadvisor_image = "gcr.io/cadvisor/cadvisor:latest"
+ cadvisor_port = 8080
+}
+
+module "grafana" {
+ source = "./grafana"
+ providers = {
+ nomad = nomad.yul1
+ }
+
+ # nomad
+ nomad_datacenters = [ "yul1" ]
+
+ # grafana
+ grafana_job_name = "prod-grafana"
+ grafana_use_canary = true
+ grafana_group_count = 1
+ grafana_vault_secret = {
+ use_vault_provider = false,
+ vault_kv_policy_name = "kv-secret",
+ vault_kv_path = "secret/data/prometheus",
+ vault_kv_field_access_key = "access_key",
+ vault_kv_field_secret_key = "secret_key"
+ }
+ grafana_container_image = "grafana/grafana:7.3.7"
+ grafana_cpu = 2000
+ grafana_mem = 2048
+ grafana_port = 3000
+}
+
+module "minio" {
+ source = "./minio"
+ providers = {
+ nomad = nomad.yul1
+ }
+
+ # nomad
+ nomad_datacenters = [ "yul1" ]
+ nomad_host_volume = "prod-volume-data1-1"
+
+ # minio
+ minio_job_name = "prod-minio"
+ minio_group_count = 4
+ minio_service_name = "storage"
+ minio_host = "http://10.32.8.1{4...7}"
+ minio_port = 9000
+ minio_container_image = "minio/minio:RELEASE.2020-12-03T05-49-24Z"
+ minio_vault_secret = {
+ use_vault_provider = false,
+ vault_kv_policy_name = "kv-secret",
+ vault_kv_path = "secret/data/minio",
+ vault_kv_field_access_key = "access_key",
+ vault_kv_field_secret_key = "secret_key"
+ }
+ minio_data_dir = "/data/"
+ minio_use_host_volume = true
+ minio_use_canary = true
+ minio_envs = [ "MINIO_BROWSER=\"off\"" ]
+
+ # minio client
+ mc_job_name = "prod-mc"
+ mc_container_image = "minio/mc:RELEASE.2020-12-10T01-26-17Z"
+ mc_extra_commands = [
+ "mc policy set public LOCALMINIO/logs.fd.io",
+ "mc policy set public LOCALMINIO/docs.fd.io",
+ "mc ilm add --expiry-days '180' LOCALMINIO/logs.fd.io",
+ "mc admin user add LOCALMINIO storage Storage1234",
+ "mc admin policy set LOCALMINIO writeonly user=storage"
+ ]
+ minio_buckets = [ "logs.fd.io", "docs.fd.io" ]
+}
+
+module "nginx" {
+ source = "./nginx"
+ providers = {
+ nomad = nomad.yul1
+ }
+
+ # nomad
+ nomad_datacenters = [ "yul1" ]
+
+ # nginx
+ nginx_job_name = "prod-nginx"
+}
+
+module "prometheus" {
+ source = "./prometheus"
+ providers = {
+ nomad = nomad.yul1
+ }
+
+ # nomad
+ nomad_datacenters = [ "yul1" ]
+ nomad_host_volume = "prod-volume-data1-1"
+
+ # prometheus
+ prometheus_job_name = "prod-prometheus"
+ prometheus_use_canary = true
+ prometheus_group_count = 4
+ prometheus_vault_secret = {
+ use_vault_provider = false,
+ vault_kv_policy_name = "kv-secret",
+ vault_kv_path = "secret/data/prometheus",
+ vault_kv_field_access_key = "access_key",
+ vault_kv_field_secret_key = "secret_key"
+ }
+ prometheus_data_dir = "/data/"
+ prometheus_use_host_volume = true
+ prometheus_version = "2.24.0"
+ prometheus_cpu = 2000
+ prometheus_mem = 8192
+ prometheus_port = 9090
+}
+
+module "vpp_device" {
+ source = "./vpp_device"
+ providers = {
+ nomad = nomad.yul1
+ }
+
+ # nomad
+ nomad_datacenters = [ "yul1" ]
+
+ # csit_shim
+ csit_shim_job_name = "prod-device-csit-shim"
+ csit_shim_group_count = "1"
+ csit_shim_cpu = "1000"
+ csit_shim_mem = "5000"
+}
diff --git a/terraform-ci-infra/1n_nmd/minio/outputs.tf b/terraform-ci-infra/1n_nmd/minio/outputs.tf
new file mode 100644
index 0000000000..309cd3b9d0
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/minio/outputs.tf
@@ -0,0 +1,4 @@
+output "minio_service_name" {
+ description = "Minio service name"
+ value = data.template_file.nomad_job_minio.vars.service_name
+} \ No newline at end of file
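Reading the value back through data.template_file.nomad_job_minio.vars works, but the same string is available directly from the module input; a slightly simpler equivalent, assuming the module declares var.minio_service_name (the root module sets it to "storage"):

    output "minio_service_name" {
      description = "Minio service name"
      value       = var.minio_service_name
    }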
diff --git a/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl b/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl
new file mode 100644
index 0000000000..2d74662f31
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl
@@ -0,0 +1,569 @@
+job "${job_name}" {
+ # The "region" parameter specifies the region in which to execute the job.
+ # If omitted, this inherits the default region name of "global".
+ # region = "global"
+ #
+ # The "datacenters" parameter specifies the list of datacenters which should
+ # be considered when placing this task. This must be provided.
+ datacenters = "${datacenters}"
+
+ # The "type" parameter controls the type of job, which impacts the scheduler's
+ # decision on placement. This configuration is optional and defaults to
+ # "service". For a full list of job types and their differences, please see
+ # the online documentation.
+ #
+ # For more information, please see the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/jobspec/schedulers
+ #
+ type = "service"
+
+ update {
+ # The "max_parallel" parameter specifies the maximum number of updates to
+ # perform in parallel. In this case, this specifies to update a single task
+ # at a time.
+ max_parallel = 1
+
+ health_check = "checks"
+
+ # The "min_healthy_time" parameter specifies the minimum time the allocation
+ # must be in the healthy state before it is marked as healthy and unblocks
+ # further allocations from being updated.
+ min_healthy_time = "10s"
+
+ # The "healthy_deadline" parameter specifies the deadline in which the
+ # allocation must be marked as healthy after which the allocation is
+ # automatically transitioned to unhealthy. Transitioning to unhealthy will
+ # fail the deployment and potentially roll back the job if "auto_revert" is
+ # set to true.
+ healthy_deadline = "3m"
+
+ # The "progress_deadline" parameter specifies the deadline in which an
+ # allocation must be marked as healthy. The deadline begins when the first
+ # allocation for the deployment is created and is reset whenever an allocation
+ # as part of the deployment transitions to a healthy state. If no allocation
+ # transitions to the healthy state before the progress deadline, the
+ # deployment is marked as failed.
+ progress_deadline = "10m"
+
+%{ if use_canary }
+ # The "canary" parameter specifies that changes to the job that would result
+ # in destructive updates should create the specified number of canaries
+ # without stopping any previous allocations. Once the operator determines the
+ # canaries are healthy, they can be promoted which unblocks a rolling update
+ # of the remaining allocations at a rate of "max_parallel".
+ #
+ # Further, setting "canary" equal to the count of the task group allows
+ # blue/green deployments. When the job is updated, a full set of the new
+ # version is deployed and upon promotion the old version is stopped.
+ canary = 1
+
+ # Specifies if the job should auto-promote to the canary version when all
+ # canaries become healthy during a deployment. Defaults to false which means
+ # canaries must be manually updated with the nomad deployment promote
+ # command.
+ auto_promote = true
+
+ # The "auto_revert" parameter specifies if the job should auto-revert to the
+ # last stable job on deployment failure. A job is marked as stable if all the
+ # allocations as part of its deployment were marked healthy.
+ auto_revert = true
+%{ endif }
+ }
+
+ # The "group" stanza defines a series of tasks that should be co-located on
+ # the same Nomad client. Any task within a group will be placed on the same
+ # client.
+ #
+ # For more information and examples on the "group" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/group
+ #
+ group "prod-group1-${service_name}" {
+ # The "count" parameter specifies the number of the task groups that should
+ # be running under this group. This value must be non-negative and defaults
+ # to 1.
+ count = ${group_count}
+
+ # The volume stanza allows the group to specify that it requires a given
+ # volume from the cluster.
+ #
+ # For more information and examples on the "volume" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/volume
+ #
+ %{ if use_host_volume }
+ volume "prod-volume1-${service_name}" {
+ type = "host"
+ read_only = false
+ source = "${host_volume}"
+ }
+ %{ endif }
+
+ # The constraint allows restricting the set of eligible nodes. Constraints
+ # may filter on attributes or client metadata.
+ #
+    # For more information and examples on the "constraint" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/constraint
+ #
+ constraint {
+ attribute = "$${attr.cpu.arch}"
+ operator = "!="
+ value = "arm64"
+ }
+
+ # The "task" stanza creates an individual unit of work, such as a Docker
+ # container, web application, or batch processing.
+ #
+ # For more information and examples on the "task" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/task
+ #
+ task "prod-task1-${service_name}" {
+ # The "driver" parameter specifies the task driver that should be used to
+ # run the task.
+ driver = "exec"
+
+ %{ if use_host_volume }
+ volume_mount {
+ volume = "prod-volume1-${service_name}"
+ destination = "${data_dir}"
+ read_only = false
+ }
+ %{ endif }
+
+ %{ if use_vault_provider }
+ vault {
+ policies = "${vault_kv_policy_name}"
+ }
+ %{ endif }
+
+ # The "config" stanza specifies the driver configuration, which is passed
+ # directly to the driver to start the task. The details of configurations
+ # are specific to each driver, so please see specific driver
+ # documentation for more information.
+ config {
+ command = "local/prometheus-${version}.linux-amd64/prometheus"
+ args = [
+ "--config.file=secrets/prometheus.yml",
+ "--storage.tsdb.path=${data_dir}prometheus/",
+ "--storage.tsdb.retention.time=15d"
+ ]
+ }
+
+ # The artifact stanza instructs Nomad to fetch and unpack a remote resource,
+ # such as a file, tarball, or binary. Nomad downloads artifacts using the
+ # popular go-getter library, which permits downloading artifacts from a
+ # variety of locations using a URL as the input source.
+ #
+ # For more information and examples on the "artifact" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/artifact
+ #
+ artifact {
+ source = "${url}"
+ }
+
+ # The "template" stanza instructs Nomad to manage a template, such as
+ # a configuration file or script. This template can optionally pull data
+ # from Consul or Vault to populate runtime configuration data.
+ #
+ # For more information and examples on the "template" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/template
+ #
+ template {
+ change_mode = "noop"
+ change_signal = "SIGINT"
+ destination = "secrets/alerts.yml"
+ left_delimiter = "{{{"
+ right_delimiter = "}}}"
+ data = <<EOH
+---
+groups:
+- name: "Consul"
+ rules:
+ - alert: ConsulServiceHealthcheckFailed
+ expr: consul_catalog_service_node_healthy == 0
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Consul service healthcheck failed (instance {{ $labels.instance }})."
+ description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`."
+ - alert: ConsulMissingMasterNode
+ expr: consul_raft_peers < 3
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Consul missing master node (instance {{ $labels.instance }})."
+ description: "Numbers of consul raft peers should be 3, in order to preserve quorum."
+ - alert: ConsulAgentUnhealthy
+ expr: consul_health_node_status{status="critical"} == 1
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Consul agent unhealthy (instance {{ $labels.instance }})."
+ description: "A Consul agent is down."
+- name: "Hosts"
+ rules:
+ - alert: NodeDown
+ expr: up == 0
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Prometheus target missing (instance {{ $labels.instance }})."
+ description: "A Prometheus target has disappeared. An exporter might be crashed."
+ - alert: HostHighCpuLoad
+ expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host high CPU load (instance {{ $labels.instance }})."
+ description: "CPU load is > 80%."
+ - alert: HostOutOfMemory
+ expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
+ for: 2m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host out of memory (instance {{ $labels.instance }})."
+ description: "Node memory is filling up (< 10% left)."
+ - alert: HostOomKillDetected
+ expr: increase(node_vmstat_oom_kill[1m]) > 0
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host OOM kill detected (instance {{ $labels.instance }})."
+ description: "OOM kill detected."
+ - alert: HostMemoryUnderMemoryPressure
+ expr: rate(node_vmstat_pgmajfault[1m]) > 1000
+ for: 2m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host memory under memory pressure (instance {{ $labels.instance }})."
+ description: "The node is under heavy memory pressure. High rate of major page faults."
+ - alert: HostOutOfDiskSpace
+ expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
+ for: 2m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host out of disk space (instance {{ $labels.instance }})."
+ description: "Disk is almost full (< 10% left)."
+ - alert: HostRaidDiskFailure
+ expr: node_md_disks{state="failed"} > 0
+ for: 2m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host RAID disk failure (instance {{ $labels.instance }})."
+ description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap."
+ - alert: HostConntrackLimit
+ expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host conntrack limit (instance {{ $labels.instance }})."
+ description: "The number of conntrack is approching limit."
+ - alert: HostNetworkInterfaceSaturated
+ expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host Network Interface Saturated (instance {{ $labels.instance }})."
+ description: "The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded."
+ - alert: HostSystemdServiceCrashed
+ expr: node_systemd_unit_state{state="failed"} == 1
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host SystemD service crashed (instance {{ $labels.instance }})."
+ description: "SystemD service crashed."
+ - alert: HostEdacCorrectableErrorsDetected
+ expr: increase(node_edac_correctable_errors_total[1m]) > 0
+ for: 0m
+ labels:
+ severity: info
+ annotations:
+ summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})."
+      description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.'
+ - alert: HostEdacUncorrectableErrorsDetected
+ expr: node_edac_uncorrectable_errors_total > 0
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})."
+      description: '{{ $labels.instance }} has {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC.'
+- name: "Min.io"
+ rules:
+ - alert: MinioDiskOffline
+ expr: minio_offline_disks > 0
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Minio disk offline (instance {{ $labels.instance }})"
+ description: "Minio disk is offline."
+ - alert: MinioStorageSpaceExhausted
+ expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 < 10
+ for: 2m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Minio storage space exhausted (instance {{ $labels.instance }})."
+ description: "Minio storage space is low (< 10 GB)."
+- name: "Prometheus"
+ rules:
+ - alert: PrometheusConfigurationReloadFailure
+ expr: prometheus_config_last_reload_successful != 1
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})."
+ description: "Prometheus configuration reload error."
+ - alert: PrometheusTooManyRestarts
+ expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Prometheus too many restarts (instance {{ $labels.instance }})."
+ description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping."
+ - alert: PrometheusAlertmanagerConfigurationReloadFailure
+ expr: alertmanager_config_last_reload_successful != 1
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})."
+ description: "AlertManager configuration reload error."
+ - alert: PrometheusRuleEvaluationFailures
+ expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})."
+ description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts."
+ - alert: PrometheusTargetScrapingSlow
+ expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Prometheus target scraping slow (instance {{ $labels.instance }})."
+ description: "Prometheus is scraping exporters slowly."
+ - alert: PrometheusTsdbCompactionsFailed
+ expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})."
+ description: "Prometheus encountered {{ $value }} TSDB compactions failures."
+ - alert: PrometheusTsdbHeadTruncationsFailed
+ expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})."
+ description: "Prometheus encountered {{ $value }} TSDB head truncation failures."
+ - alert: PrometheusTsdbWalCorruptions
+ expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})."
+ description: "Prometheus encountered {{ $value }} TSDB WAL corruptions."
+ - alert: PrometheusTsdbWalTruncationsFailed
+ expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})."
+ description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures."
+EOH
+ }
+
+ template {
+ change_mode = "noop"
+ change_signal = "SIGINT"
+ destination = "secrets/prometheus.yml"
+ data = <<EOH
+---
+global:
+ scrape_interval: 5s
+ scrape_timeout: 5s
+ evaluation_interval: 5s
+
+alerting:
+ alertmanagers:
+ - consul_sd_configs:
+ - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
+ services: [ 'alertmanager' ]
+
+rule_files:
+ - 'alerts.yml'
+
+scrape_configs:
+
+ - job_name: 'Nomad Cluster'
+ consul_sd_configs:
+ - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
+ services: [ 'nomad-client', 'nomad' ]
+ relabel_configs:
+ - source_labels: [__meta_consul_tags]
+ regex: '(.*)http(.*)'
+ action: keep
+ metrics_path: /v1/metrics
+ params:
+ format: [ 'prometheus' ]
+
+ - job_name: 'Consul Cluster'
+ static_configs:
+ - targets: [ '10.30.51.30:8500', '10.30.51.32:8500', '10.30.51.33:8500' ]
+ metrics_path: /v1/agent/metrics
+ params:
+ format: [ 'prometheus' ]
+
+ - job_name: 'Alertmanager'
+ consul_sd_configs:
+ - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
+ services: [ 'alertmanager' ]
+
+ - job_name: 'Blackbox Exporter (icmp)'
+ static_configs:
+ - targets: [ 'gerrit.fd.io' ]
+ - targets: [ 'jenkins.fd.io' ]
+ - targets: [ '10.30.51.32' ]
+ params:
+ module: [ 'icmp_v4' ]
+ relabel_configs:
+ - source_labels: [__address__]
+ target_label: __param_target
+ - source_labels: [__param_target]
+ target_label: instance
+ - target_label: __address__
+ replacement: localhost:9115
+ metrics_path: /probe
+
+ - job_name: 'Blackbox Exporter (http)'
+ static_configs:
+ - targets: [ 'gerrit.fd.io' ]
+ - targets: [ 'jenkins.fd.io' ]
+ params:
+ module: [ 'http_2xx' ]
+ relabel_configs:
+ - source_labels: [__address__]
+ target_label: __param_target
+ - source_labels: [__param_target]
+ target_label: instance
+ - target_label: __address__
+ replacement: localhost:9115
+ metrics_path: /probe
+
+ - job_name: 'cAdvisor Exporter'
+ consul_sd_configs:
+ - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
+ services: [ 'cadvisorexporter' ]
+
+ - job_name: 'Grafana'
+ consul_sd_configs:
+ - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
+ services: [ 'grafana' ]
+
+ - job_name: 'Node Exporter'
+ consul_sd_configs:
+ - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
+ services: [ 'nodeexporter' ]
+
+ - job_name: 'Prometheus'
+ consul_sd_configs:
+ - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
+ services: [ 'prometheus' ]
+
+ - job_name: 'Minio'
+ bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg
+ consul_sd_configs:
+ - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
+ services: [ 'storage' ]
+ metrics_path: /minio/prometheus/metrics
+EOH
+ }
+
+ # The service stanza instructs Nomad to register a service with Consul.
+ #
+      # For more information and examples on the "service" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/service
+ #
+ service {
+ name = "${service_name}"
+ port = "${service_name}"
+ tags = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ]
+ check {
+ name = "Prometheus Check Live"
+ type = "http"
+ path = "/-/healthy"
+ interval = "10s"
+ timeout = "2s"
+ }
+ }
+
+ # The "resources" stanza describes the requirements a task needs to
+ # execute. Resource requirements include memory, network, cpu, and more.
+ # This ensures the task will execute on a machine that contains enough
+ # resource capacity.
+ #
+ # For more information and examples on the "resources" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/resources
+ #
+ resources {
+ cpu = ${cpu}
+ memory = ${mem}
+ # The network stanza specifies the networking requirements for the task
+ # group, including the network mode and port allocations. When scheduling
+ # jobs in Nomad they are provisioned across your fleet of machines along
+ # with other jobs and services. Because you don't know in advance what host
+ # your job will be provisioned on, Nomad will provide your tasks with
+ # network configuration when they start up.
+ #
+        # For more information and examples on the "network" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/network
+ #
+ network {
+ port "${service_name}" {
+ static = ${port}
+ }
+ }
+ }
+ }
+ }
+} \ No newline at end of file
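One subtlety in the template above: single-dollar interpolations such as ${job_name} are resolved by Terraform at render time, while the double-dollar forms ($${attr.cpu.arch} and $${NOMAD_ALLOC_INDEX}) are escapes, so the rendered jobspec still contains a literal ${...} for Nomad to interpolate at run time. A small before/after sketch, assuming version = "2.24.0":

    # template source                          rendered jobspec
    attribute = "$${attr.cpu.arch}"        ->  attribute = "${attr.cpu.arch}"
    command   = "local/prometheus-${version}.linux-amd64/prometheus"
                                           ->  command   = "local/prometheus-2.24.0.linux-amd64/prometheus"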
diff --git a/terraform-ci-infra/1n_nmd/prometheus/main.tf b/terraform-ci-infra/1n_nmd/prometheus/main.tf
new file mode 100644
index 0000000000..9506ba3941
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/prometheus/main.tf
@@ -0,0 +1,37 @@
+locals {
+ datacenters = join(",", var.nomad_datacenters)
+
+ prometheus_url = join("",
+ [
+ "https://github.com",
+ "/prometheus/prometheus/releases/download/",
+ "v${var.prometheus_version}/",
+ "prometheus-${var.prometheus_version}.linux-amd64.tar.gz"
+ ]
+ )
+}
+
+data "template_file" "nomad_job_prometheus" {
+ template = file("${path.module}/conf/nomad/prometheus.hcl")
+ vars = {
+ datacenters = local.datacenters
+ url = local.prometheus_url
+ job_name = var.prometheus_job_name
+ use_canary = var.prometheus_use_canary
+ group_count = var.prometheus_group_count
+ use_host_volume = var.prometheus_use_host_volume
+ host_volume = var.nomad_host_volume
+ data_dir = var.prometheus_data_dir
+ service_name = var.prometheus_service_name
+ use_vault_provider = var.prometheus_vault_secret.use_vault_provider
+ version = var.prometheus_version
+ cpu = var.prometheus_cpu
+ mem = var.prometheus_mem
+ port = var.prometheus_port
+ }
+}
+
+resource "nomad_job" "nomad_job_prometheus" {
+ jobspec = data.template_file.nomad_job_prometheus.rendered
+ detach = false
+} \ No newline at end of file
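The join() over a string list keeps the long release URL readable; format() expresses the same construction in a single call. An equivalent sketch:

    locals {
      prometheus_url = format(
        "https://github.com/prometheus/prometheus/releases/download/v%s/prometheus-%s.linux-amd64.tar.gz",
        var.prometheus_version,
        var.prometheus_version
      )
    }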
diff --git a/terraform-ci-infra/1n_nmd/prometheus/variables.tf b/terraform-ci-infra/1n_nmd/prometheus/variables.tf
new file mode 100644
index 0000000000..a509533ccd
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/prometheus/variables.tf
@@ -0,0 +1,84 @@
+# Nomad
+variable "nomad_datacenters" {
+ description = "Nomad data centers"
+ type = list(string)
+ default = [ "dc1" ]
+}
+
+variable "nomad_host_volume" {
+ description = "Nomad Host Volume"
+ type = string
+ default = "persistence"
+}
+
+# Prometheus
+variable "prometheus_job_name" {
+ description = "Prometheus job name"
+ type = string
+ default = "prometheus"
+}
+
+variable "prometheus_group_count" {
+ description = "Number of prometheus group instances"
+ type = number
+ default = 1
+}
+
+variable "prometheus_service_name" {
+ description = "Prometheus service name"
+ type = string
+ default = "prometheus"
+}
+
+variable "prometheus_version" {
+ description = "Prometheus version"
+ type = string
+ default = "v2.24.0"
+}
+
+variable "prometheus_use_canary" {
+ description = "Uses canary deployment"
+ type = bool
+ default = false
+}
+
+variable "prometheus_vault_secret" {
+ description = "Set of properties to be able to fetch secret from vault"
+ type = object({
+ use_vault_provider = bool,
+ vault_kv_policy_name = string,
+ vault_kv_path = string,
+ vault_kv_field_access_key = string,
+ vault_kv_field_secret_key = string
+ })
+}
+
+variable "prometheus_cpu" {
+ description = "Prometheus CPU allocation"
+ type = number
+ default = 2000
+}
+
+variable "prometheus_mem" {
+ description = "Prometheus RAM allocation"
+ type = number
+ default = 8192
+}
+
+variable "prometheus_port" {
+ description = "Prometheus TCP allocation"
+ type = number
+ default = 9200
+}
+
+variable "prometheus_data_dir" {
+ description = "Prometheus DISK allocation"
+ type = string
+ default = "/data"
+}
+
+variable "prometheus_use_host_volume" {
+ description = "Use Nomad host volume feature"
+ type = bool
+ default = false
+} \ No newline at end of file
diff --git a/terraform-ci-infra/1n_nmd/terraform.tfstate b/terraform-ci-infra/1n_nmd/terraform.tfstate
new file mode 100644
index 0000000000..c15e983e94
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/terraform.tfstate
@@ -0,0 +1,829 @@
+{
+ "version": 4,
+ "terraform_version": "0.14.5",
+ "serial": 1030,
+ "lineage": "e4e7f30a-652d-7a31-e31c-5e3a3388c9b9",
+ "outputs": {},
+ "resources": [
+ {
+ "module": "module.alertmanager",
+ "mode": "data",
+ "type": "template_file",
+ "name": "nomad_job_alertmanager",
+ "provider": "provider[\"registry.terraform.io/hashicorp/template\"]",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "filename": null,
+ "id": "ae09d392c808970a05f2423cd8dbf158f6cb6d3ae50260cd7d16f1575ce43871",
+ "rendered": "job \"prod-alertmanager\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-alertmanager\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-alertmanager\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. 
This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\nglobal:\n # The API URL to use for Slack notifications.\n slack_api_url: 'https://hooks.slack.com/services/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'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: 'default-receiver'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname', 'cluster', 'service']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n # This routes performs a regular expression match on alert labels to\n # catch alerts that are related to a list of services.\n - match_re:\n service: .*\n receiver: default-receiver\n # The service has a sub-route for critical alerts, any alerts\n # that do not match, i.e. 
severity != critical, fall-back to the\n # parent node and are sent to 'team-X-mails'\n routes:\n - match:\n severity: critical\n receiver: 'default-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n # Apply inhibition if the alertname is the same.\n # CAUTION:\n # If all label names listed in `equal` are missing\n # from both the source and target alerts,\n # the inhibition rule will apply!\n equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: 'default-receiver'\n slack_configs:\n - channel: '#fdio-infra-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"alertmanager\"\n port = \"alertmanager\"\n tags = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 1000\n memory = 1024\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"alertmanager\" {\n static = 9093\n }\n }\n }\n }\n }\n}",
+ "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-${version}.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${url}\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. 
This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\nglobal:\n # The API URL to use for Slack notifications.\n slack_api_url: '${slack_api_url}'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: '${default_receiver}'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname', 'cluster', 'service']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n # This routes performs a regular expression match on alert labels to\n # catch alerts that are related to a list of services.\n - match_re:\n service: .*\n receiver: ${default_receiver}\n # The service has a sub-route for critical alerts, any alerts\n # that do not match, i.e. 
severity != critical, fall-back to the\n # parent node and are sent to 'team-X-mails'\n routes:\n - match:\n severity: critical\n receiver: '${default_receiver}'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n # Apply inhibition if the alertname is the same.\n # CAUTION:\n # If all label names listed in `equal` are missing\n # from both the source and target alerts,\n # the inhibition rule will apply!\n equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: '${default_receiver}'\n slack_configs:\n - channel: '#${slack_channel}'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}",
+ "vars": {
+ "cpu": "1000",
+ "datacenters": "yul1",
+ "default_receiver": "default-receiver",
+ "group_count": "1",
+ "job_name": "prod-alertmanager",
+ "mem": "1024",
+ "port": "9093",
+ "service_name": "alertmanager",
+ "slack_api_url": "https://hooks.slack.com/services/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",
+ "slack_channel": "fdio-infra-monitoring",
+ "url": "https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz",
+ "use_canary": "true",
+ "use_vault_provider": "false",
+ "version": "0.21.0"
+ }
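The vars recorded above are interpolated into the jobspec by the module's template_file data source. A minimal sketch of that wiring, assuming the template ships inside the module (the file path and the input-variable names are assumptions; only the vars map keys are confirmed by the state):

data "template_file" "nomad_job_alertmanager" {
  # Hypothetical template location; only the var names below are
  # taken from the recorded state.
  template = file("${path.module}/conf/nomad/alertmanager.hcl")
  vars = {
    job_name           = var.job_name
    datacenters        = var.datacenters
    default_receiver   = var.default_receiver
    group_count        = var.group_count
    service_name       = var.service_name
    cpu                = var.cpu
    mem                = var.mem
    port               = var.port
    slack_api_url      = var.slack_api_url
    slack_channel      = var.slack_channel
    url                = var.url
    use_canary         = var.use_canary
    use_vault_provider = var.use_vault_provider
    # "version" is reserved for module inputs in Terraform, so the
    # input variable presumably carries a different name.
    version            = var.am_version
  }
}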
+ },
+ "sensitive_attributes": []
+ }
+ ]
+ },
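The rendered jobspec is then registered against the yul1 Nomad cluster by the nomad_job resource recorded in the next state entry. A minimal sketch of that resource, assuming the standard hashicorp/nomad provider usage (the provider alias, the detach value, and the dependency on the template_file data source are all confirmed by the state):

resource "nomad_job" "nomad_job_alertmanager" {
  provider = nomad.yul1
  # Submit the interpolated HCL produced by the data source above.
  jobspec  = data.template_file.nomad_job_alertmanager.rendered
  # Matches the recorded attribute: wait for the deployment to
  # complete rather than detaching after registration.
  detach   = false
}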
+ {
+ "module": "module.alertmanager",
+ "mode": "managed",
+ "type": "nomad_job",
+ "name": "nomad_job_alertmanager",
+ "provider": "provider[\"registry.terraform.io/hashicorp/nomad\"].yul1",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "allocation_ids": [
+ "464144d9-8c27-dc03-71bb-6f8d525bc486"
+ ],
+ "datacenters": [
+ "yul1"
+ ],
+ "deployment_id": "d69d73a6-2a1b-ebf6-e421-30ce1f34f519",
+ "deployment_status": "successful",
+ "deregister_on_destroy": true,
+ "deregister_on_id_change": true,
+ "detach": false,
+ "id": "prod-alertmanager",
+ "jobspec": "job \"prod-alertmanager\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-alertmanager\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-alertmanager\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. 
This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\nglobal:\n # The API URL to use for Slack notifications.\n slack_api_url: 'https://hooks.slack.com/services/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'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: 'default-receiver'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname', 'cluster', 'service']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n # This routes performs a regular expression match on alert labels to\n # catch alerts that are related to a list of services.\n - match_re:\n service: .*\n receiver: default-receiver\n # The service has a sub-route for critical alerts, any alerts\n # that do not match, i.e. 
severity != critical, fall-back to the\n # parent node and are sent to 'team-X-mails'\n routes:\n - match:\n severity: critical\n receiver: 'default-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n # Apply inhibition if the alertname is the same.\n # CAUTION:\n # If all label names listed in `equal` are missing\n # from both the source and target alerts,\n # the inhibition rule will apply!\n equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: 'default-receiver'\n slack_configs:\n - channel: '#fdio-infra-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"alertmanager\"\n port = \"alertmanager\"\n tags = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 1000\n memory = 1024\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"alertmanager\" {\n static = 9093\n }\n }\n }\n }\n }\n}",
+ "json": null,
+ "modify_index": "7147924",
+ "name": "prod-alertmanager",
+ "namespace": "default",
+ "policy_override": null,
+ "purge_on_destroy": null,
+ "region": "global",
+ "task_groups": [
+ {
+ "count": 1,
+ "meta": {},
+ "name": "prod-group1-alertmanager",
+ "task": [
+ {
+ "driver": "exec",
+ "meta": {},
+ "name": "prod-task1-alertmanager",
+ "volume_mounts": []
+ }
+ ],
+ "volumes": []
+ }
+ ],
+ "type": "service"
+ },
+ "sensitive_attributes": [],
+ "private": "bnVsbA==",
+ "dependencies": [
+ "module.alertmanager.data.template_file.nomad_job_alertmanager"
+ ]
+ }
+ ]
+ },
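Both alertmanager state entries would typically originate from one root-level module call that forwards these values. A sketch of that invocation, assuming a local module path and string-typed inputs (only the values themselves come from the recorded vars; the module source and input names are guesses):

module "alertmanager" {
  source = "./alertmanager"  # hypothetical path
  providers = {
    nomad = nomad.yul1
  }

  # Values as recorded in the state above.
  job_name           = "prod-alertmanager"
  datacenters        = "yul1"
  group_count        = "1"
  service_name       = "alertmanager"
  port               = "9093"
  cpu                = "1000"
  mem                = "1024"
  am_version         = "0.21.0"  # "version" is reserved for module inputs
  url                = "https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz"
  default_receiver   = "default-receiver"
  slack_channel      = "fdio-infra-monitoring"
  slack_api_url      = var.slack_api_url  # sensitive; literal value is recorded in the state above
  use_canary         = "true"
  use_vault_provider = "false"
}

With use_canary set, the rendered job gains the canary/auto_promote/auto_revert block visible in the template, enabling blue/green style rollouts of the single-count group.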
+ {
+ "module": "module.exporter",
+ "mode": "data",
+ "type": "template_file",
+ "name": "nomad_job_exporter",
+ "provider": "provider[\"registry.terraform.io/hashicorp/template\"]",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "filename": null,
+ "id": "28db119a83617686c94496ed114455041d4fc500d93c4de8f8b8cfbe7d4c1db2",
+ "rendered": "job \"prod-exporter\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"system\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-exporter-amd64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-nodeexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"raw_exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/node_exporter-1.0.1.linux-amd64/node_exporter\"\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. 
Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-amd64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"nodeexporter\"\n port = \"nodeexporter\"\n check {\n name = \"Node Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"nodeexporter\" {\n static = 9100\n }\n }\n }\n }\n task \"prod-task2-blackboxexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/blackbox_exporter-0.18.0.linux-amd64/blackbox_exporter\"\n args = [\n \"--config.file=secrets/blackbox.yml\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/blackbox.yml\"\n data = \u003c\u003cEOH\nmodules:\n http_2xx:\n prober: http\n timeout: 5s\n http:\n valid_http_versions: [\"HTTP/1.1\", \"HTTP/2.0\"]\n no_follow_redirects: false\n fail_if_ssl: false\n fail_if_not_ssl: true\n tls_config:\n insecure_skip_verify: false\n preferred_ip_protocol: \"ip4\"\n icmp_v4:\n prober: icmp\n timeout: 5s\n icmp:\n preferred_ip_protocol: \"ip4\"\n dns_udp:\n prober: dns\n timeout: 5s\n dns:\n query_name: \"jenkins.fd.io\"\n query_type: \"A\"\n valid_rcodes:\n - NOERROR\nEOH\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. 
Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-amd64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"blackboxexporter\"\n port = \"blackboxexporter\"\n tags = [ \"blackboxexporter${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Blackbox Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"blackboxexporter\" {\n static = 9115\n }\n }\n }\n }\n\n task \"prod-task3-cadvisorexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"gcr.io/cadvisor/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"cadvisorexporter\"\n port = \"cadvisorexporter\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. 
Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"cadvisorexporter\" {\n static = 8080\n }\n }\n }\n }\n }\n\n group \"prod-group1-exporter-arm64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"==\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-nodeexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"raw_exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/node_exporter-1.0.1.linux-arm64/node_exporter\"\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-arm64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"nodeexporter\"\n port = \"nodeexporter\"\n check {\n name = \"Node Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"nodeexporter\" {\n static = 9100\n }\n }\n }\n }\n\n task \"prod-task2-blackboxexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. 
The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/blackbox_exporter-0.18.0.linux-arm64/blackbox_exporter\"\n args = [\n \"--config.file=secrets/blackbox.yml\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/blackbox.yml\"\n data = \u003c\u003cEOH\nmodules:\n http_2xx:\n prober: http\n timeout: 5s\n http:\n valid_http_versions: [\"HTTP/1.1\", \"HTTP/2.0\"]\n no_follow_redirects: false\n fail_if_ssl: false\n fail_if_not_ssl: true\n tls_config:\n insecure_skip_verify: false\n preferred_ip_protocol: \"ip4\"\n icmp_v4:\n prober: icmp\n timeout: 5s\n icmp:\n preferred_ip_protocol: \"ip4\"\n dns_udp:\n prober: dns\n timeout: 5s\n dns:\n query_name: \"jenkins.fd.io\"\n query_type: \"A\"\n valid_rcodes:\n - NOERROR\nEOH\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-arm64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"blackboxexporter\"\n port = \"blackboxexporter\"\n tags = [ \"blackboxexporter${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Blackbox Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"blackboxexporter\" {\n static = 9115\n }\n }\n }\n }\n\n task \"prod-task3-cadvisorexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. 
The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n # There is currently no official release for arm yet...using community.\n image = \"zcube/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"cadvisorexporter\"\n port = \"cadvisorexporter\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"cadvisorexporter\" {\n static = 8080\n }\n }\n }\n }\n }\n}",
+ "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"system\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-exporter-amd64\" {\n # The constraint allows restricting the set of eligible nodes. 
Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${node_service_name}-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"raw_exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/node_exporter-${node_version}.linux-amd64/node_exporter\"\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${node_url_amd64}\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${node_service_name}\"\n port = \"${node_service_name}\"\n check {\n name = \"Node Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${node_service_name}\" {\n static = ${node_port}\n }\n }\n }\n }\n task \"prod-task2-${blackbox_service_name}-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/blackbox_exporter-${blackbox_version}.linux-amd64/blackbox_exporter\"\n args = [\n \"--config.file=secrets/blackbox.yml\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. 
This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/blackbox.yml\"\n data = \u003c\u003cEOH\nmodules:\n http_2xx:\n prober: http\n timeout: 5s\n http:\n valid_http_versions: [\"HTTP/1.1\", \"HTTP/2.0\"]\n no_follow_redirects: false\n fail_if_ssl: false\n fail_if_not_ssl: true\n tls_config:\n insecure_skip_verify: false\n preferred_ip_protocol: \"ip4\"\n icmp_v4:\n prober: icmp\n timeout: 5s\n icmp:\n preferred_ip_protocol: \"ip4\"\n dns_udp:\n prober: dns\n timeout: 5s\n dns:\n query_name: \"jenkins.fd.io\"\n query_type: \"A\"\n valid_rcodes:\n - NOERROR\nEOH\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${blackbox_url_amd64}\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${blackbox_service_name}\"\n port = \"${blackbox_service_name}\"\n tags = [ \"${blackbox_service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Blackbox Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${blackbox_service_name}\" {\n static = ${blackbox_port}\n }\n }\n }\n }\n\n task \"prod-task3-${cadvisor_service_name}-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. 
The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"${cadvisor_image}\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${cadvisor_service_name}\"\n port = \"${cadvisor_service_name}\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${cadvisor_service_name}\" {\n static = ${cadvisor_port}\n }\n }\n }\n }\n }\n\n group \"prod-group1-exporter-arm64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"==\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${node_service_name}-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"raw_exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/node_exporter-${node_version}.linux-arm64/node_exporter\"\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. 
Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${node_url_arm64}\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${node_service_name}\"\n port = \"${node_service_name}\"\n check {\n name = \"Node Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${node_service_name}\" {\n static = ${node_port}\n }\n }\n }\n }\n\n task \"prod-task2-${blackbox_service_name}-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/blackbox_exporter-${blackbox_version}.linux-arm64/blackbox_exporter\"\n args = [\n \"--config.file=secrets/blackbox.yml\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/blackbox.yml\"\n data = \u003c\u003cEOH\nmodules:\n http_2xx:\n prober: http\n timeout: 5s\n http:\n valid_http_versions: [\"HTTP/1.1\", \"HTTP/2.0\"]\n no_follow_redirects: false\n fail_if_ssl: false\n fail_if_not_ssl: true\n tls_config:\n insecure_skip_verify: false\n preferred_ip_protocol: \"ip4\"\n icmp_v4:\n prober: icmp\n timeout: 5s\n icmp:\n preferred_ip_protocol: \"ip4\"\n dns_udp:\n prober: dns\n timeout: 5s\n dns:\n query_name: \"jenkins.fd.io\"\n query_type: \"A\"\n valid_rcodes:\n - NOERROR\nEOH\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. 
Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${blackbox_url_arm64}\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${blackbox_service_name}\"\n port = \"${blackbox_service_name}\"\n tags = [ \"${blackbox_service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Blackbox Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${blackbox_service_name}\" {\n static = ${blackbox_port}\n }\n }\n }\n }\n\n task \"prod-task3-${cadvisor_service_name}-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n # There is currently no official release for arm yet...using community.\n image = \"zcube/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${cadvisor_service_name}\"\n port = \"${cadvisor_service_name}\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. 
Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${cadvisor_service_name}\" {\n static = ${cadvisor_port}\n }\n }\n }\n }\n }\n}",
+ "vars": {
+ "blackbox_port": "9115",
+ "blackbox_service_name": "blackboxexporter",
+ "blackbox_url_amd64": "https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-amd64.tar.gz",
+ "blackbox_url_arm64": "https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-arm64.tar.gz",
+ "blackbox_version": "0.18.0",
+ "cadvisor_image": "gcr.io/cadvisor/cadvisor:latest",
+ "cadvisor_port": "8080",
+ "cadvisor_service_name": "cadvisorexporter",
+ "datacenters": "yul1",
+ "job_name": "prod-exporter",
+ "node_port": "9100",
+ "node_service_name": "nodeexporter",
+ "node_url_amd64": "https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-amd64.tar.gz",
+ "node_url_arm64": "https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-arm64.tar.gz",
+ "node_version": "1.0.1",
+ "use_canary": "false"
+ }
+ },
+ "sensitive_attributes": []
+ }
+ ]
+ },
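The pair of state entries around this point records a pattern worth making explicit: a "template_file" data source renders the Nomad jobspec from a template plus the "vars" map, and a "nomad_job" resource submits the rendered text, which is why "module.exporter.data.template_file.nomad_job_exporter" appears as a dependency of the managed resource further down. A minimal sketch of that wiring, assuming an illustrative template path and a trimmed variable set (the real module supplies the full "vars" map recorded above):

    data "template_file" "nomad_job_exporter" {
      # Illustrative .tpl path; variable names and values are taken from
      # the "vars" block recorded in the state entry above.
      template = file("${path.module}/prod_exporter.nomad.tpl")
      vars = {
        job_name          = "prod-exporter"
        datacenters       = "yul1"
        node_service_name = "nodeexporter"
        node_port         = "9100"
      }
    }

    resource "nomad_job" "nomad_job_exporter" {
      # Submitting the rendered text is what populates the "jobspec"
      # attribute of the managed resource that follows.
      jobspec = data.template_file.nomad_job_exporter.rendered
    }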
+ {
+ "module": "module.exporter",
+ "mode": "managed",
+ "type": "nomad_job",
+ "name": "nomad_job_exporter",
+ "provider": "provider[\"registry.terraform.io/hashicorp/nomad\"].yul1",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "allocation_ids": [
+ "e11d14cf-0b12-7f11-f982-e75abb2fa245",
+ "be1d7792-8b38-0769-1899-4d78ac17d775",
+ "33190f8d-6b97-5a79-32c3-c2a48eb33cab",
+ "7b9a89f3-3094-73aa-de9e-b674cdf3962b",
+ "1b730294-e8c6-d330-72a6-7502ef6d8195",
+ "3b3932ce-d57e-d0c6-0d36-b32724d83a0f",
+ "d07855d6-29a8-dba5-92a8-bed5673c3b81",
+ "4ea7fa69-4819-5050-37ea-bbce995e994d",
+ "2c694f1a-fb0e-67b9-407a-dd5f1d72daa0",
+ "52e828d4-79a7-9e43-2cdb-1ba6088f3257",
+ "e23fc3eb-1699-e1e8-629b-82a21b72ecf4",
+ "6f83b8c0-4313-eb5a-9c22-e8db2226e832",
+ "15ee749c-07f0-2864-f907-919faa62cee2",
+ "12025e7c-4a27-9a9b-b0aa-a205179139eb",
+ "a6234b96-8c7b-1fd4-e85f-d361774d3005",
+ "fadf8b49-0e1c-98ea-42b5-119667fa092e",
+ "2cb8905c-3d36-878d-917e-493f8a982970",
+ "a1426c7b-a4a1-0f19-cc11-162b396e9ce9",
+ "f24eee7b-e480-3386-9a6e-357edb9ae137",
+ "7b7c32b2-9cc9-bb99-b203-2842e4495f5c"
+ ],
+ "datacenters": [
+ "yul1"
+ ],
+ "deployment_id": "",
+ "deployment_status": "",
+ "deregister_on_destroy": true,
+ "deregister_on_id_change": true,
+ "detach": false,
+ "id": "prod-exporter",
+ "jobspec": "job \"prod-exporter\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"system\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-exporter-amd64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-nodeexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"raw_exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/node_exporter-1.0.1.linux-amd64/node_exporter\"\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. 
Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-amd64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"nodeexporter\"\n port = \"nodeexporter\"\n check {\n name = \"Node Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"nodeexporter\" {\n static = 9100\n }\n }\n }\n }\n task \"prod-task2-blackboxexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/blackbox_exporter-0.18.0.linux-amd64/blackbox_exporter\"\n args = [\n \"--config.file=secrets/blackbox.yml\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/blackbox.yml\"\n data = \u003c\u003cEOH\nmodules:\n http_2xx:\n prober: http\n timeout: 5s\n http:\n valid_http_versions: [\"HTTP/1.1\", \"HTTP/2.0\"]\n no_follow_redirects: false\n fail_if_ssl: false\n fail_if_not_ssl: true\n tls_config:\n insecure_skip_verify: false\n preferred_ip_protocol: \"ip4\"\n icmp_v4:\n prober: icmp\n timeout: 5s\n icmp:\n preferred_ip_protocol: \"ip4\"\n dns_udp:\n prober: dns\n timeout: 5s\n dns:\n query_name: \"jenkins.fd.io\"\n query_type: \"A\"\n valid_rcodes:\n - NOERROR\nEOH\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. 
Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-amd64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"blackboxexporter\"\n port = \"blackboxexporter\"\n tags = [ \"blackboxexporter${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Blackbox Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"blackboxexporter\" {\n static = 9115\n }\n }\n }\n }\n\n task \"prod-task3-cadvisorexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"gcr.io/cadvisor/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"cadvisorexporter\"\n port = \"cadvisorexporter\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. 
Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"cadvisorexporter\" {\n static = 8080\n }\n }\n }\n }\n }\n\n group \"prod-group1-exporter-arm64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"==\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-nodeexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"raw_exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/node_exporter-1.0.1.linux-arm64/node_exporter\"\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-arm64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"nodeexporter\"\n port = \"nodeexporter\"\n check {\n name = \"Node Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"nodeexporter\" {\n static = 9100\n }\n }\n }\n }\n\n task \"prod-task2-blackboxexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. 
The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/blackbox_exporter-0.18.0.linux-arm64/blackbox_exporter\"\n args = [\n \"--config.file=secrets/blackbox.yml\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/blackbox.yml\"\n data = \u003c\u003cEOH\nmodules:\n http_2xx:\n prober: http\n timeout: 5s\n http:\n valid_http_versions: [\"HTTP/1.1\", \"HTTP/2.0\"]\n no_follow_redirects: false\n fail_if_ssl: false\n fail_if_not_ssl: true\n tls_config:\n insecure_skip_verify: false\n preferred_ip_protocol: \"ip4\"\n icmp_v4:\n prober: icmp\n timeout: 5s\n icmp:\n preferred_ip_protocol: \"ip4\"\n dns_udp:\n prober: dns\n timeout: 5s\n dns:\n query_name: \"jenkins.fd.io\"\n query_type: \"A\"\n valid_rcodes:\n - NOERROR\nEOH\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-arm64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"blackboxexporter\"\n port = \"blackboxexporter\"\n tags = [ \"blackboxexporter${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Blackbox Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"blackboxexporter\" {\n static = 9115\n }\n }\n }\n }\n\n task \"prod-task3-cadvisorexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. 
The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n # There is currently no official release for arm yet...using community.\n image = \"zcube/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"cadvisorexporter\"\n port = \"cadvisorexporter\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"cadvisorexporter\" {\n static = 8080\n }\n }\n }\n }\n }\n}",
+ "json": null,
+ "modify_index": "7147927",
+ "name": "prod-exporter",
+ "namespace": "default",
+ "policy_override": null,
+ "purge_on_destroy": null,
+ "region": "global",
+ "task_groups": [
+ {
+ "count": 1,
+ "meta": {},
+ "name": "prod-group1-exporter-amd64",
+ "task": [
+ {
+ "driver": "raw_exec",
+ "meta": {},
+ "name": "prod-task1-nodeexporter-amd64",
+ "volume_mounts": []
+ },
+ {
+ "driver": "exec",
+ "meta": {},
+ "name": "prod-task2-blackboxexporter-amd64",
+ "volume_mounts": []
+ },
+ {
+ "driver": "docker",
+ "meta": {},
+ "name": "prod-task3-cadvisorexporter-amd64",
+ "volume_mounts": []
+ }
+ ],
+ "volumes": []
+ },
+ {
+ "count": 1,
+ "meta": {},
+ "name": "prod-group1-exporter-arm64",
+ "task": [
+ {
+ "driver": "raw_exec",
+ "meta": {},
+ "name": "prod-task1-nodeexporter-arm64",
+ "volume_mounts": []
+ },
+ {
+ "driver": "exec",
+ "meta": {},
+ "name": "prod-task2-blackboxexporter-arm64",
+ "volume_mounts": []
+ },
+ {
+ "driver": "docker",
+ "meta": {},
+ "name": "prod-task3-cadvisorexporter-arm64",
+ "volume_mounts": []
+ }
+ ],
+ "volumes": []
+ }
+ ],
+ "type": "system"
+ },
+ "sensitive_attributes": [],
+ "private": "bnVsbA==",
+ "dependencies": [
+ "module.exporter.data.template_file.nomad_job_exporter"
+ ]
+ }
+ ]
+ },
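Because prod-exporter is a "system" job, Nomad places one copy of each task group on every eligible client, and the two groups' mirrored cpu.arch constraints partition the fleet so each client runs exactly the arch-appropriate exporter set; the twenty allocation IDs recorded above are consistent with that placement. Condensed from the jobspec, keeping only the placement logic:

    job "prod-exporter" {
      type = "system"              # one allocation per eligible client

      group "prod-group1-exporter-amd64" {
        constraint {
          attribute = "${attr.cpu.arch}"
          operator  = "!="
          value     = "arm64"
        }
        # node_exporter / blackbox_exporter / cAdvisor tasks, amd64 artifacts
      }

      group "prod-group1-exporter-arm64" {
        constraint {
          attribute = "${attr.cpu.arch}"
          operator  = "=="
          value     = "arm64"
        }
        # the same three tasks, arm64 artifacts and the community cAdvisor image
      }
    }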
+ {
+ "module": "module.grafana",
+ "mode": "data",
+ "type": "template_file",
+ "name": "nomad_job_grafana",
+ "provider": "provider[\"registry.terraform.io/hashicorp/template\"]",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "filename": null,
+ "id": "080a6dd88248c6a78ec3025a0ec9e471aed9acde495ca47db868b0893b4446b7",
+ "rendered": "job \"prod-grafana\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-grafana\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-grafana\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"grafana/grafana:7.3.7\"\n dns_servers = [ \"${attr.unique.network.ip-address}\" ]\n volumes = [\n \"secrets/prometheus.yml:/etc/grafana/provisioning/datasources/prometheus.yml\",\n \"secrets/dashboards.yml:/etc/grafana/provisioning/dashboards/dashboards.yml\",\n \"secrets/grafana.ini:/etc/grafana/grafana.ini\",\n \"secrets/node_exporter.json:/etc/grafana/provisioning/dashboards/node_exporter.json\",\n \"secrets/docker_cadvisor.json:/etc/grafana/provisioning/dashboards/docker_cadvisor.json\",\n \"secrets/nomad.json:/etc/grafana/provisioning/dashboards/nomad.json\",\n \"secrets/consul.json:/etc/grafana/provisioning/dashboards/consul.json\",\n \"secrets/prometheus.json:/etc/grafana/provisioning/dashboards/prometheus.json\",\n \"secrets/blackbox_exporter_http.json:/etc/grafana/provisioning/dashboards/blackbox_exporter_http.json\",\n \"secrets/blackbox_exporter_icmp.json:/etc/grafana/provisioning/dashboards/blackbox_exporter_icmp.json\"\n ]\n }\n\n artifact {\n # Prometheus Node Exporter\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/node_exporter.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Docker cAdvisor\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/docker_cadvisor.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Nomad\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/nomad.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Consul\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/consul.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/prometheus.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus Blackbox Exporter ICMP\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/blackbox_exporter_icmp.json\"\n destination = 
\"secrets/\"\n }\n\n artifact {\n # Prometheus Blackbox Exporter HTTP\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/blackbox_exporter_http.json\"\n destination = \"secrets/\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\napiVersion: 1\ndatasources:\n- name: Prometheus\n type: prometheus\n access: direct\n orgId: 1\n url: http://prometheus.service.consul:9090\n basicAuth: false\n isDefault: true\n version: 1\n editable: false\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/dashboards.yml\"\n data = \u003c\u003cEOH\napiVersion: 1\nproviders:\n- name: dashboards\n type: file\n disableDeletion: false\n updateIntervalSeconds: 10\n allowUiUpdates: false\n options:\n path: /etc/grafana/provisioning/dashboards\n foldersFromFilesStructure: true\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/grafana.ini\"\n data = \u003c\u003cEOH\napp_mode = production\n\n[metrics]\nenabled = true\n\n[server]\nprotocol = http\nhttp_port = 3000\nroot_url = http://grafana.service.consul:3000\nenable_gzip = true\n;cert_file =\n;cert_key =\n\n[security]\nadmin_user = grafanauser\nadmin_password = Grafana1234\nsecret_key = SW2YcwTIb9zpOOhoPsMm\n\n[users]\nallow_sign_up = false\nallow_org_create = false\nauto_assign_org = true\nauto_assign_org_role = Viewer\ndefault_theme = dark\n\n[auth.basic]\nenabled = true\n\n[auth]\ndisable_login_form = false\ndisable_signout_menu = false\n\n[auth.anonymous]\nenabled = false\n\n[log]\nmode = console\nlevel = info\n\n[log.console]\nlevel = info\nformat = console\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"grafana\"\n port = \"grafana\"\n tags = [ \"grafana${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Grafana Check Live\"\n type = \"http\"\n protocol = \"http\"\n tls_skip_verify = true\n path = \"/api/health\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 2048\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. 
Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"grafana\" {\n static = 3000\n }\n }\n }\n }\n }\n}",
+ "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"${image}\"\n dns_servers = [ \"$${attr.unique.network.ip-address}\" ]\n volumes = [\n \"secrets/prometheus.yml:/etc/grafana/provisioning/datasources/prometheus.yml\",\n \"secrets/dashboards.yml:/etc/grafana/provisioning/dashboards/dashboards.yml\",\n \"secrets/grafana.ini:/etc/grafana/grafana.ini\",\n \"secrets/node_exporter.json:/etc/grafana/provisioning/dashboards/node_exporter.json\",\n \"secrets/docker_cadvisor.json:/etc/grafana/provisioning/dashboards/docker_cadvisor.json\",\n \"secrets/nomad.json:/etc/grafana/provisioning/dashboards/nomad.json\",\n \"secrets/consul.json:/etc/grafana/provisioning/dashboards/consul.json\",\n \"secrets/prometheus.json:/etc/grafana/provisioning/dashboards/prometheus.json\",\n \"secrets/blackbox_exporter_http.json:/etc/grafana/provisioning/dashboards/blackbox_exporter_http.json\",\n \"secrets/blackbox_exporter_icmp.json:/etc/grafana/provisioning/dashboards/blackbox_exporter_icmp.json\"\n ]\n }\n\n artifact {\n # Prometheus Node Exporter\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/node_exporter.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Docker cAdvisor\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/docker_cadvisor.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Nomad\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/nomad.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Consul\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/consul.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/prometheus.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus Blackbox Exporter ICMP\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/blackbox_exporter_icmp.json\"\n 
destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus Blackbox Exporter HTTP\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/blackbox_exporter_http.json\"\n destination = \"secrets/\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\napiVersion: 1\ndatasources:\n- name: Prometheus\n type: prometheus\n access: direct\n orgId: 1\n url: http://prometheus.service.consul:9090\n basicAuth: false\n isDefault: true\n version: 1\n editable: false\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/dashboards.yml\"\n data = \u003c\u003cEOH\napiVersion: 1\nproviders:\n- name: dashboards\n type: file\n disableDeletion: false\n updateIntervalSeconds: 10\n allowUiUpdates: false\n options:\n path: /etc/grafana/provisioning/dashboards\n foldersFromFilesStructure: true\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/grafana.ini\"\n data = \u003c\u003cEOH\napp_mode = production\n\n[metrics]\nenabled = true\n\n[server]\nprotocol = http\nhttp_port = ${port}\nroot_url = http://${service_name}.service.consul:${port}\nenable_gzip = true\n;cert_file =\n;cert_key =\n\n[security]\nadmin_user = grafanauser\nadmin_password = Grafana1234\nsecret_key = SW2YcwTIb9zpOOhoPsMm\n\n[users]\nallow_sign_up = false\nallow_org_create = false\nauto_assign_org = true\nauto_assign_org_role = Viewer\ndefault_theme = dark\n\n[auth.basic]\nenabled = true\n\n[auth]\ndisable_login_form = false\ndisable_signout_menu = false\n\n[auth.anonymous]\nenabled = false\n\n[log]\nmode = console\nlevel = info\n\n[log.console]\nlevel = info\nformat = console\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Grafana Check Live\"\n type = \"http\"\n protocol = \"http\"\n tls_skip_verify = true\n path = \"/api/health\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. 
Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}",
+ "vars": {
+ "cpu": "2000",
+ "datacenters": "yul1",
+ "group_count": "1",
+ "image": "grafana/grafana:7.3.7",
+ "job_name": "prod-grafana",
+ "mem": "2048",
+ "port": "3000",
+ "service_name": "grafana",
+ "use_canary": "true",
+ "use_vault_provider": "false"
+ }
+ },
+ "sensitive_attributes": []
+ }
+ ]
+ },
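Comparing the "template" and "rendered" attributes above shows the two rendering mechanics these templates rely on: Terraform interpolates ${...} at render time, $${...} escapes to a literal ${...} left for Nomad's own runtime interpolation, and %{ if ... } directives gate whole blocks on a variable. A paired before/after sketch using the recorded vars (service_name = "grafana", use_canary = "true"); the surrounding update stanza is elided:

    # Template as written:
    tags = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ]
    %{ if use_canary }
    canary       = 1
    auto_promote = true
    auto_revert  = true
    %{ endif }

    # Rendered output:
    tags = [ "grafana${NOMAD_ALLOC_INDEX}" ]
    canary       = 1
    auto_promote = true
    auto_revert  = true

The exporter module sets use_canary = "false", which is why its rendered jobspec earlier in this state carries no canary block at all.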
+ {
+ "module": "module.grafana",
+ "mode": "managed",
+ "type": "nomad_job",
+ "name": "nomad_job_grafana",
+ "provider": "provider[\"registry.terraform.io/hashicorp/nomad\"].yul1",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "allocation_ids": [
+ "1cf5f45a-3d10-b433-600e-f92696e49a25",
+ "d2ddca98-1bb8-680d-06f1-00fcf4887eca"
+ ],
+ "datacenters": [
+ "yul1"
+ ],
+ "deployment_id": "fef8bdc5-f769-c504-f089-8886755e45ef",
+ "deployment_status": "successful",
+ "deregister_on_destroy": true,
+ "deregister_on_id_change": true,
+ "detach": false,
+ "id": "prod-grafana",
+ "jobspec": "job \"prod-grafana\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-grafana\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-grafana\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"grafana/grafana:7.3.7\"\n dns_servers = [ \"${attr.unique.network.ip-address}\" ]\n volumes = [\n \"secrets/prometheus.yml:/etc/grafana/provisioning/datasources/prometheus.yml\",\n \"secrets/dashboards.yml:/etc/grafana/provisioning/dashboards/dashboards.yml\",\n \"secrets/grafana.ini:/etc/grafana/grafana.ini\",\n \"secrets/node_exporter.json:/etc/grafana/provisioning/dashboards/node_exporter.json\",\n \"secrets/docker_cadvisor.json:/etc/grafana/provisioning/dashboards/docker_cadvisor.json\",\n \"secrets/nomad.json:/etc/grafana/provisioning/dashboards/nomad.json\",\n \"secrets/consul.json:/etc/grafana/provisioning/dashboards/consul.json\",\n \"secrets/prometheus.json:/etc/grafana/provisioning/dashboards/prometheus.json\",\n \"secrets/blackbox_exporter_http.json:/etc/grafana/provisioning/dashboards/blackbox_exporter_http.json\",\n \"secrets/blackbox_exporter_icmp.json:/etc/grafana/provisioning/dashboards/blackbox_exporter_icmp.json\"\n ]\n }\n\n artifact {\n # Prometheus Node Exporter\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/node_exporter.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Docker cAdvisor\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/docker_cadvisor.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Nomad\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/nomad.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Consul\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/consul.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/prometheus.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus Blackbox Exporter ICMP\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/blackbox_exporter_icmp.json\"\n destination = 
\"secrets/\"\n }\n\n artifact {\n # Prometheus Blackbox Exporter HTTP\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/blackbox_exporter_http.json\"\n destination = \"secrets/\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\napiVersion: 1\ndatasources:\n- name: Prometheus\n type: prometheus\n access: direct\n orgId: 1\n url: http://prometheus.service.consul:9090\n basicAuth: false\n isDefault: true\n version: 1\n editable: false\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/dashboards.yml\"\n data = \u003c\u003cEOH\napiVersion: 1\nproviders:\n- name: dashboards\n type: file\n disableDeletion: false\n updateIntervalSeconds: 10\n allowUiUpdates: false\n options:\n path: /etc/grafana/provisioning/dashboards\n foldersFromFilesStructure: true\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/grafana.ini\"\n data = \u003c\u003cEOH\napp_mode = production\n\n[metrics]\nenabled = true\n\n[server]\nprotocol = http\nhttp_port = 3000\nroot_url = http://grafana.service.consul:3000\nenable_gzip = true\n;cert_file =\n;cert_key =\n\n[security]\nadmin_user = grafanauser\nadmin_password = Grafana1234\nsecret_key = SW2YcwTIb9zpOOhoPsMm\n\n[users]\nallow_sign_up = false\nallow_org_create = false\nauto_assign_org = true\nauto_assign_org_role = Viewer\ndefault_theme = dark\n\n[auth.basic]\nenabled = true\n\n[auth]\ndisable_login_form = false\ndisable_signout_menu = false\n\n[auth.anonymous]\nenabled = false\n\n[log]\nmode = console\nlevel = info\n\n[log.console]\nlevel = info\nformat = console\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"grafana\"\n port = \"grafana\"\n tags = [ \"grafana${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Grafana Check Live\"\n type = \"http\"\n protocol = \"http\"\n tls_skip_verify = true\n path = \"/api/health\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 2048\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. 
Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"grafana\" {\n static = 3000\n }\n }\n }\n }\n }\n}",
+ "json": null,
+ "modify_index": "7148410",
+ "name": "prod-grafana",
+ "namespace": "default",
+ "policy_override": null,
+ "purge_on_destroy": null,
+ "region": "global",
+ "task_groups": [
+ {
+ "count": 1,
+ "meta": {},
+ "name": "prod-group1-grafana",
+ "task": [
+ {
+ "driver": "docker",
+ "meta": {},
+ "name": "prod-task1-grafana",
+ "volume_mounts": []
+ }
+ ],
+ "volumes": []
+ }
+ ],
+ "type": "service"
+ },
+ "sensitive_attributes": [],
+ "private": "bnVsbA==",
+ "dependencies": [
+ "module.grafana.data.template_file.nomad_job_grafana"
+ ]
+ }
+ ]
+ },
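Unlike the exporter, the grafana entry records a non-empty deployment_id with deployment_status "successful": with canary = 1, auto_promote, and auto_revert in the update stanza, each job update starts one canary allocation and promotes it automatically once its Consul health check passes, which is also consistent with two allocation IDs being recorded for a count = 1 group. The task itself assembles Grafana's provisioning entirely from artifact and template stanzas that stage files under secrets/ and bind-mount them into the container; condensed from the jobspec above, with two of the ten mounts shown:

    task "prod-task1-grafana" {
      driver = "docker"

      config {
        image = "grafana/grafana:7.3.7"
        # Each staged file is bind-mounted to the path Grafana
        # provisioning expects:
        volumes = [
          "secrets/grafana.ini:/etc/grafana/grafana.ini",
          "secrets/prometheus.yml:/etc/grafana/provisioning/datasources/prometheus.yml",
        ]
      }

      artifact {
        # Dashboard JSON fetched into the staging dir at task start.
        source      = "https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/node_exporter.json"
        destination = "secrets/"
      }

      template {
        # Config rendered by Nomad into the same staging dir.
        destination = "secrets/grafana.ini"
        data        = "..."   # INI body as recorded above
      }
    }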
+ {
+ "module": "module.minio",
+ "mode": "data",
+ "type": "template_file",
+ "name": "nomad_job_mc",
+ "provider": "provider[\"registry.terraform.io/hashicorp/template\"]",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "filename": null,
+ "id": "ca0c95bbe91c4ac393d5cd5bef8efb90fb5f176f266ba233542fd4338b51c6cc",
+ "rendered": "job \"prod-mc\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"batch\"\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-mc\" {\n task \"prod-task1-create-buckets\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"minio/mc:RELEASE.2020-12-10T01-26-17Z\"\n entrypoint = [\n \"/bin/sh\",\n \"-c\",\n \"mc config host add LOCALMINIO http://storage.service.consul:9000 $MINIO_ACCESS_KEY $MINIO_SECRET_KEY \u0026\u0026 mc mb -p LOCALMINIO/logs.fd.io LOCALMINIO/docs.fd.io ; mc policy set public LOCALMINIO/logs.fd.io mc policy set public LOCALMINIO/docs.fd.io mc ilm add --expiry-days '180' LOCALMINIO/logs.fd.io mc admin user add LOCALMINIO storage Storage1234 mc admin policy set LOCALMINIO writeonly user=storage\"\n ]\n dns_servers = [ \"${attr.unique.network.ip-address}\" ]\n privileged = false\n }\n\n # The env stanza configures a list of environment variables to populate\n # the task's environment before starting.\n env {\n \n MINIO_ACCESS_KEY = \"minio\"\n MINIO_SECRET_KEY = \"minio123\"\n \n \n }\n }\n }\n}\n",
+ "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"batch\"\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-mc\" {\n task \"prod-task1-create-buckets\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"${image}\"\n entrypoint = [\n \"/bin/sh\",\n \"-c\",\n \"${command}\"\n ]\n dns_servers = [ \"$${attr.unique.network.ip-address}\" ]\n privileged = false\n }\n\n # The env stanza configures a list of environment variables to populate\n # the task's environment before starting.\n env {\n %{ if use_vault_provider }\n {{ with secret \"${vault_kv_path}\" }}\n MINIO_ACCESS_KEY = \"{{ .Data.data.${vault_kv_field_access_key} }}\"\n MINIO_SECRET_KEY = \"{{ .Data.data.${vault_kv_field_secret_key} }}\"\n {{ end }}\n %{ else }\n MINIO_ACCESS_KEY = \"${access_key}\"\n MINIO_SECRET_KEY = \"${secret_key}\"\n %{ endif }\n ${ envs }\n }\n }\n }\n}\n",
+ "vars": {
+ "access_key": "minio",
+ "command": "mc config host add LOCALMINIO http://storage.service.consul:9000 $MINIO_ACCESS_KEY $MINIO_SECRET_KEY \u0026\u0026 mc mb -p LOCALMINIO/logs.fd.io LOCALMINIO/docs.fd.io ; mc policy set public LOCALMINIO/logs.fd.io mc policy set public LOCALMINIO/docs.fd.io mc ilm add --expiry-days '180' LOCALMINIO/logs.fd.io mc admin user add LOCALMINIO storage Storage1234 mc admin policy set LOCALMINIO writeonly user=storage",
+ "datacenters": "yul1",
+ "envs": "",
+ "image": "minio/mc:RELEASE.2020-12-10T01-26-17Z",
+ "job_name": "prod-mc",
+ "minio_port": "9000",
+ "minio_service_name": "storage",
+ "secret_key": "minio123",
+ "service_name": "mc",
+ "use_vault_provider": "false"
+ }
+ },
+ "sensitive_attributes": []
+ }
+ ]
+ },
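Editor's note: in the rendered `entrypoint` above, only the first two steps carry explicit shell separators (`&&`, JSON-escaped as `\u0026\u0026`, then `;`); from `mc policy set public LOCALMINIO/logs.fd.io` onward the remaining subcommands appear to run as extra arguments to that single `mc policy set` invocation. A hedged sketch of what the command presumably intends, with separators throughout (a suggested correction, not what the state records):

    config {
      image = "minio/mc:RELEASE.2020-12-10T01-26-17Z"
      # Join every step with && so each must succeed before the next runs.
      entrypoint = [
        "/bin/sh", "-c",
        "mc config host add LOCALMINIO http://storage.service.consul:9000 $MINIO_ACCESS_KEY $MINIO_SECRET_KEY && mc mb -p LOCALMINIO/logs.fd.io LOCALMINIO/docs.fd.io && mc policy set public LOCALMINIO/logs.fd.io && mc policy set public LOCALMINIO/docs.fd.io && mc ilm add --expiry-days '180' LOCALMINIO/logs.fd.io && mc admin user add LOCALMINIO storage Storage1234 && mc admin policy set LOCALMINIO writeonly user=storage"
      ]
    }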
+ {
+ "module": "module.minio",
+ "mode": "data",
+ "type": "template_file",
+ "name": "nomad_job_minio",
+ "provider": "provider[\"registry.terraform.io/hashicorp/template\"]",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "filename": null,
+ "id": "c38a0b7182d39c70ec07bdaa998f57a215d7193b264c1b1c1a299c84e7ca53de",
+ "rendered": "job \"prod-minio\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # All groups in this job should be scheduled on different hosts.\n constraint {\n operator = \"distinct_hosts\"\n value = \"true\"\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-minio\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # https://www.nomadproject.io/docs/job-specification/volume\n \n volume \"prod-volume1-minio\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task.html\n #\n task \"prod-task1-minio\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n \n volume_mount {\n volume = \"prod-volume1-minio\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"minio/minio:RELEASE.2020-12-03T05-49-24Z\"\n dns_servers = [ \"${attr.unique.network.ip-address}\" ]\n network_mode = \"host\"\n command = \"server\"\n args = [ \"http://10.32.8.1{4...7}:9000/data/\" ]\n port_map {\n http = 9000\n }\n privileged = false\n }\n\n # The env stanza configures a list of environment variables to populate\n # the task's environment before starting.\n env {\n\n MINIO_ACCESS_KEY = \"minio\"\n MINIO_SECRET_KEY = \"minio123\"\n\n MINIO_BROWSER=\"off\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service.html\n #\n service {\n name = \"storage\"\n port = \"http\"\n tags = [ \"storage${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Min.io Server HTTP Check Live\"\n type = \"http\"\n port = \"http\"\n protocol = \"http\"\n method = \"GET\"\n path = \"/minio/health/live\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n check {\n name = \"Min.io Server HTTP Check Ready\"\n type = \"http\"\n port = \"http\"\n protocol = \"http\"\n method = \"GET\"\n path = \"/minio/health/ready\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources.html\n #\n resources {\n cpu = 40000\n memory = 40000\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. 
Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network.html\n #\n network {\n port \"http\" {\n static = 9000\n }\n }\n }\n }\n }\n}\n",
+ "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # All groups in this job should be scheduled on different hosts.\n constraint {\n operator = \"distinct_hosts\"\n value = \"true\"\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-minio\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n # https://www.nomadproject.io/docs/job-specification/volume\n %{ if use_host_volume }\n volume \"prod-volume1-minio\" {\n type = \"host\"\n read_only = false\n source = \"${host_volume}\"\n }\n %{ endif }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task.html\n #\n task \"prod-task1-minio\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n %{ if use_host_volume }\n volume_mount {\n volume = \"prod-volume1-minio\"\n destination = \"${data_dir}\"\n read_only = false\n }\n %{ endif }\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"${image}\"\n dns_servers = [ \"$${attr.unique.network.ip-address}\" ]\n network_mode = \"host\"\n command = \"server\"\n args = [ \"${host}:${port}${data_dir}\" ]\n port_map {\n http = ${port}\n }\n privileged = false\n }\n\n # The env stanza configures a list of environment variables to populate\n # the task's environment before starting.\n env {\n%{ if use_vault_provider }\n{{ with secret \"${vault_kv_path}\" }}\n MINIO_ACCESS_KEY = \"{{ .Data.data.${vault_kv_field_access_key} }}\"\n MINIO_SECRET_KEY = \"{{ .Data.data.${vault_kv_field_secret_key} }}\"\n{{ end }}\n%{ else }\n MINIO_ACCESS_KEY = \"${access_key}\"\n MINIO_SECRET_KEY = \"${secret_key}\"\n%{ endif }\n ${ envs }\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service.html\n #\n service {\n name = \"${service_name}\"\n port = \"http\"\n tags = [ \"storage$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Min.io Server HTTP Check Live\"\n type = \"http\"\n port = \"http\"\n protocol = \"http\"\n method = \"GET\"\n path = \"/minio/health/live\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n check {\n name = \"Min.io Server HTTP Check Ready\"\n type = \"http\"\n port = \"http\"\n protocol = \"http\"\n method = \"GET\"\n path = \"/minio/health/ready\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. 
Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources.html\n #\n resources {\n cpu = ${cpu}\n memory = ${memory}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network.html\n #\n network {\n port \"http\" {\n static = ${port}\n }\n }\n }\n }\n }\n}\n",
+ "vars": {
+ "access_key": "minio",
+ "cpu": "40000",
+ "cpu_proxy": "200",
+ "data_dir": "/data/",
+ "datacenters": "yul1",
+ "envs": "MINIO_BROWSER=\"off\"",
+ "group_count": "4",
+ "host": "http://10.32.8.1{4...7}",
+ "host_volume": "prod-volume-data1-1",
+ "image": "minio/minio:RELEASE.2020-12-03T05-49-24Z",
+ "job_name": "prod-minio",
+ "memory": "40000",
+ "memory_proxy": "128",
+ "port": "9000",
+ "secret_key": "minio123",
+ "service_name": "storage",
+ "upstreams": "[]",
+ "use_canary": "true",
+ "use_host_volume": "true",
+ "use_vault_provider": "false"
+ }
+ },
+ "sensitive_attributes": []
+ }
+ ]
+ },
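Editor's note: the single `args` value above drives the whole distributed setup. Minio's ellipsis notation expands `http://10.32.8.1{4...7}:9000/data/` client-side into four server endpoints (10.32.8.14 through 10.32.8.17), which lines up with `group_count = "4"` and the `distinct_hosts` constraint, i.e. one Minio peer per host. The relevant jobspec fragment, annotated:

    config {
      command = "server"
      # Minio expands {4...7} into four peers, one per allocation:
      #   http://10.32.8.14:9000/data/ ... http://10.32.8.17:9000/data/
      args = [ "http://10.32.8.1{4...7}:9000/data/" ]
    }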
+ {
+ "module": "module.minio",
+ "mode": "managed",
+ "type": "nomad_job",
+ "name": "nomad_job_mc",
+ "provider": "provider[\"registry.terraform.io/hashicorp/nomad\"].yul1",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "allocation_ids": [
+ "8ca5aa5b-2616-f167-f40e-a0becb50c87e"
+ ],
+ "datacenters": [
+ "yul1"
+ ],
+ "deployment_id": "",
+ "deployment_status": "",
+ "deregister_on_destroy": true,
+ "deregister_on_id_change": true,
+ "detach": false,
+ "id": "prod-mc",
+ "jobspec": "job \"prod-mc\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"batch\"\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-mc\" {\n task \"prod-task1-create-buckets\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"minio/mc:RELEASE.2020-12-10T01-26-17Z\"\n entrypoint = [\n \"/bin/sh\",\n \"-c\",\n \"mc config host add LOCALMINIO http://storage.service.consul:9000 $MINIO_ACCESS_KEY $MINIO_SECRET_KEY \u0026\u0026 mc mb -p LOCALMINIO/logs.fd.io LOCALMINIO/docs.fd.io ; mc policy set public LOCALMINIO/logs.fd.io mc policy set public LOCALMINIO/docs.fd.io mc ilm add --expiry-days '180' LOCALMINIO/logs.fd.io mc admin user add LOCALMINIO storage Storage1234 mc admin policy set LOCALMINIO writeonly user=storage\"\n ]\n dns_servers = [ \"${attr.unique.network.ip-address}\" ]\n privileged = false\n }\n\n # The env stanza configures a list of environment variables to populate\n # the task's environment before starting.\n env {\n \n MINIO_ACCESS_KEY = \"minio\"\n MINIO_SECRET_KEY = \"minio123\"\n \n \n }\n }\n }\n}\n",
+ "json": null,
+ "modify_index": "7148419",
+ "name": "prod-mc",
+ "namespace": "default",
+ "policy_override": null,
+ "purge_on_destroy": null,
+ "region": "global",
+ "task_groups": [
+ {
+ "count": 1,
+ "meta": {},
+ "name": "prod-group1-mc",
+ "task": [
+ {
+ "driver": "docker",
+ "meta": {},
+ "name": "prod-task1-create-buckets",
+ "volume_mounts": null
+ }
+ ],
+ "volumes": null
+ }
+ ],
+ "type": "batch"
+ },
+ "sensitive_attributes": [],
+ "private": "bnVsbA==",
+ "dependencies": [
+ "module.minio.data.template_file.nomad_job_mc",
+ "module.minio.data.template_file.nomad_job_minio",
+ "module.minio.nomad_job.nomad_job_minio"
+ ]
+ }
+ ]
+ },
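Editor's note: the `dependencies` list recorded for `prod-mc` orders the bucket-provisioning batch job after both minio templates and the `nomad_job_minio` resource, so Terraform registers it only once the storage job exists. Whether this comes from an explicit `depends_on` or an implicit reference is not visible in the state; an explicit version would look like:

    resource "nomad_job" "nomad_job_mc" {
      jobspec    = data.template_file.nomad_job_mc.rendered
      # Assumed ordering: create buckets only after the Minio job is registered.
      depends_on = [ nomad_job.nomad_job_minio ]
    }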
+ {
+ "module": "module.minio",
+ "mode": "managed",
+ "type": "nomad_job",
+ "name": "nomad_job_minio",
+ "provider": "provider[\"registry.terraform.io/hashicorp/nomad\"].yul1",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "allocation_ids": [
+ "322e6ea5-acc7-2f37-3022-cd0a31c2f3db",
+ "619833b9-dd70-46ae-f576-75c3cd058a13",
+ "0510561a-c828-dd23-5910-b92d9f2c41ef",
+ "504924c8-387b-abe2-c4d5-d6a7424a4689"
+ ],
+ "datacenters": [
+ "yul1"
+ ],
+ "deployment_id": "cd6d79cf-6e6f-289d-a0bb-6b2bb8da769a",
+ "deployment_status": "successful",
+ "deregister_on_destroy": true,
+ "deregister_on_id_change": true,
+ "detach": false,
+ "id": "prod-minio",
+ "jobspec": "job \"prod-minio\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # All groups in this job should be scheduled on different hosts.\n constraint {\n operator = \"distinct_hosts\"\n value = \"true\"\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-minio\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # https://www.nomadproject.io/docs/job-specification/volume\n \n volume \"prod-volume1-minio\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task.html\n #\n task \"prod-task1-minio\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n \n volume_mount {\n volume = \"prod-volume1-minio\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"minio/minio:RELEASE.2020-12-03T05-49-24Z\"\n dns_servers = [ \"${attr.unique.network.ip-address}\" ]\n network_mode = \"host\"\n command = \"server\"\n args = [ \"http://10.32.8.1{4...7}:9000/data/\" ]\n port_map {\n http = 9000\n }\n privileged = false\n }\n\n # The env stanza configures a list of environment variables to populate\n # the task's environment before starting.\n env {\n\n MINIO_ACCESS_KEY = \"minio\"\n MINIO_SECRET_KEY = \"minio123\"\n\n MINIO_BROWSER=\"off\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service.html\n #\n service {\n name = \"storage\"\n port = \"http\"\n tags = [ \"storage${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Min.io Server HTTP Check Live\"\n type = \"http\"\n port = \"http\"\n protocol = \"http\"\n method = \"GET\"\n path = \"/minio/health/live\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n check {\n name = \"Min.io Server HTTP Check Ready\"\n type = \"http\"\n port = \"http\"\n protocol = \"http\"\n method = \"GET\"\n path = \"/minio/health/ready\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources.html\n #\n resources {\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. 
Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network.html\n #\n cpu = 40000\n memory = 40000\n network {\n port \"http\" {\n static = 9000\n }\n }\n }\n }\n }\n}\n",
+ "json": null,
+ "modify_index": "5933603",
+ "name": "prod-minio",
+ "namespace": "default",
+ "policy_override": null,
+ "purge_on_destroy": null,
+ "region": "global",
+ "task_groups": [
+ {
+ "count": 4,
+ "meta": {},
+ "name": "prod-group1-minio",
+ "task": [
+ {
+ "driver": "docker",
+ "meta": {},
+ "name": "prod-task1-minio",
+ "volume_mounts": [
+ {
+ "destination": "/data/",
+ "read_only": false,
+ "volume": "prod-volume1-minio"
+ }
+ ]
+ }
+ ],
+ "volumes": [
+ {
+ "name": "prod-volume1-minio",
+ "read_only": false,
+ "source": "prod-volume-data1-1",
+ "type": "host"
+ }
+ ]
+ }
+ ],
+ "type": "service"
+ },
+ "sensitive_attributes": [],
+ "private": "bnVsbA==",
+ "dependencies": [
+ "module.minio.data.template_file.nomad_job_minio"
+ ]
+ }
+ ]
+ },
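Editor's note: the `volumes`/`volume_mounts` recorded above rely on a host volume named `prod-volume-data1-1` being declared on every eligible Nomad client; that declaration lives in the client agent configuration, not in this state. A minimal sketch of the client side (the on-disk path is an assumption, chosen to match the `/data/` mount destination):

    client {
      enabled = true
      host_volume "prod-volume-data1-1" {
        path      = "/data"   # assumed; the actual client path is not in this state
        read_only = false
      }
    }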
+ {
+ "module": "module.nginx",
+ "mode": "data",
+ "type": "template_file",
+ "name": "nomad_job_nginx",
+ "provider": "provider[\"registry.terraform.io/hashicorp/template\"]",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "filename": null,
+ "id": "2816603491214d9abe10981b9f9cc6a4cb806acbc6981b952653847fec86e9ff",
+ "rendered": "job \"prod-nginx\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 0\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = false\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 0\n }\n\n # The reschedule stanza specifies the group's rescheduling strategy. If\n # specified at the job level, the configuration will apply to all groups\n # within the job. If the reschedule stanza is present on both the job and the\n # group, they are merged with the group stanza taking the highest precedence\n # and then the job.\n reschedule {\n delay = \"30s\"\n delay_function = \"constant\"\n unlimited = true\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-nginx\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The restart stanza configures a tasks's behavior on task failure. Restarts\n # happen on the client that is running the task.\n restart {\n interval = \"10m\"\n attempts = 2\n delay = \"15s\"\n mode = \"fail\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task.html\n #\n task \"prod-task1-nginx\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"nginx:stable\"\n dns_servers = [ \"${attr.unique.network.ip-address}\" ]\n port_map {\n https = 443\n }\n privileged = false\n volumes = [\n \"/etc/consul.d/ssl/consul.pem:/etc/ssl/certs/nginx-cert.pem\",\n \"/etc/consul.d/ssl/consul-key.pem:/etc/ssl/private/nginx-key.pem\",\n \"custom/upstream.conf:/etc/nginx/conf.d/upstream.conf\",\n \"custom/logs.conf:/etc/nginx/conf.d/logs.conf\",\n \"custom/docs.conf:/etc/nginx/conf.d/docs.conf\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. 
This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template.html\n #\n template {\n data = \u003c\u003cEOH\n upstream storage {\n server storage0.storage.service.consul:9000;\n server storage1.storage.service.consul:9000;\n server storage2.storage.service.consul:9000;\n server storage3.storage.service.consul:9000;\n }\n EOH\n destination = \"custom/upstream.conf\"\n }\n template {\n data = \u003c\u003cEOH\n server {\n listen 443 ssl default_server;\n server_name logs.nginx.service.consul;\n keepalive_timeout 70;\n ssl_session_cache shared:SSL:10m;\n ssl_session_timeout 10m;\n ssl_protocols TLSv1.2;\n ssl_prefer_server_ciphers on;\n ssl_ciphers \"ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES256-SHA384:ECDHE-RSA-AES256-SHA384\";\n ssl_certificate /etc/ssl/certs/nginx-cert.pem;\n ssl_certificate_key /etc/ssl/private/nginx-key.pem;\n location / {\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/logs.fd.io/;\n server_name_in_redirect off;\n }\n location ~ (.*html.gz)$ {\n add_header Content-Encoding gzip;\n add_header Content-Type text/html;\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/logs.fd.io/$1;\n server_name_in_redirect off;\n }\n location ~ (.*txt.gz|.*log.gz)$ {\n add_header Content-Encoding gzip;\n add_header Content-Type text/plain;\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/logs.fd.io/$1;\n server_name_in_redirect off;\n }\n location ~ (.*xml.gz)$ {\n add_header Content-Encoding gzip;\n add_header Content-Type application/xml;\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/logs.fd.io/$1;\n server_name_in_redirect off;\n }\n }\n EOH\n destination = \"custom/logs.conf\"\n }\n template {\n data = \u003c\u003cEOH\n server {\n listen 443 ssl;\n server_name docs.nginx.service.consul;\n keepalive_timeout 70;\n ssl_session_cache shared:SSL:10m;\n ssl_session_timeout 10m;\n ssl_protocols TLSv1.2;\n ssl_prefer_server_ciphers on;\n ssl_ciphers \"ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES256-SHA384:ECDHE-RSA-AES256-SHA384\";\n ssl_certificate /etc/ssl/certs/nginx-cert.pem;\n ssl_certificate_key /etc/ssl/private/nginx-key.pem;\n location / {\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/docs.fd.io/;\n server_name_in_redirect off;\n }\n }\n EOH\n destination = \"custom/docs.conf\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n 
#\n # https://www.nomadproject.io/docs/job-specification/service.html\n #\n service {\n name = \"nginx\"\n port = \"https\"\n tags = [ \"docs\", \"logs\" ]\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources.html\n #\n resources {\n cpu = 1000\n memory = 1024\n network {\n mode = \"bridge\"\n port \"https\" {\n static = 443\n }\n }\n }\n }\n }\n}",
+ "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 0\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = false\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 0\n }\n\n # The reschedule stanza specifies the group's rescheduling strategy. If\n # specified at the job level, the configuration will apply to all groups\n # within the job. If the reschedule stanza is present on both the job and the\n # group, they are merged with the group stanza taking the highest precedence\n # and then the job.\n reschedule {\n delay = \"30s\"\n delay_function = \"constant\"\n unlimited = true\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-nginx\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The restart stanza configures a tasks's behavior on task failure. Restarts\n # happen on the client that is running the task.\n restart {\n interval = \"10m\"\n attempts = 2\n delay = \"15s\"\n mode = \"fail\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task.html\n #\n task \"prod-task1-nginx\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"nginx:stable\"\n dns_servers = [ \"$${attr.unique.network.ip-address}\" ]\n port_map {\n https = 443\n }\n privileged = false\n volumes = [\n \"/etc/consul.d/ssl/consul.pem:/etc/ssl/certs/nginx-cert.pem\",\n \"/etc/consul.d/ssl/consul-key.pem:/etc/ssl/private/nginx-key.pem\",\n \"custom/upstream.conf:/etc/nginx/conf.d/upstream.conf\",\n \"custom/logs.conf:/etc/nginx/conf.d/logs.conf\",\n \"custom/docs.conf:/etc/nginx/conf.d/docs.conf\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. 
This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template.html\n #\n template {\n data = \u003c\u003cEOH\n upstream storage {\n server storage0.storage.service.consul:9000;\n server storage1.storage.service.consul:9000;\n server storage2.storage.service.consul:9000;\n server storage3.storage.service.consul:9000;\n }\n EOH\n destination = \"custom/upstream.conf\"\n }\n template {\n data = \u003c\u003cEOH\n server {\n listen 443 ssl default_server;\n server_name logs.nginx.service.consul;\n keepalive_timeout 70;\n ssl_session_cache shared:SSL:10m;\n ssl_session_timeout 10m;\n ssl_protocols TLSv1.2;\n ssl_prefer_server_ciphers on;\n ssl_ciphers \"ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES256-SHA384:ECDHE-RSA-AES256-SHA384\";\n ssl_certificate /etc/ssl/certs/nginx-cert.pem;\n ssl_certificate_key /etc/ssl/private/nginx-key.pem;\n location / {\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/logs.fd.io/;\n server_name_in_redirect off;\n }\n location ~ (.*html.gz)$ {\n add_header Content-Encoding gzip;\n add_header Content-Type text/html;\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/logs.fd.io/$1;\n server_name_in_redirect off;\n }\n location ~ (.*txt.gz|.*log.gz)$ {\n add_header Content-Encoding gzip;\n add_header Content-Type text/plain;\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/logs.fd.io/$1;\n server_name_in_redirect off;\n }\n location ~ (.*xml.gz)$ {\n add_header Content-Encoding gzip;\n add_header Content-Type application/xml;\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/logs.fd.io/$1;\n server_name_in_redirect off;\n }\n }\n EOH\n destination = \"custom/logs.conf\"\n }\n template {\n data = \u003c\u003cEOH\n server {\n listen 443 ssl;\n server_name docs.nginx.service.consul;\n keepalive_timeout 70;\n ssl_session_cache shared:SSL:10m;\n ssl_session_timeout 10m;\n ssl_protocols TLSv1.2;\n ssl_prefer_server_ciphers on;\n ssl_ciphers \"ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES256-SHA384:ECDHE-RSA-AES256-SHA384\";\n ssl_certificate /etc/ssl/certs/nginx-cert.pem;\n ssl_certificate_key /etc/ssl/private/nginx-key.pem;\n location / {\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/docs.fd.io/;\n server_name_in_redirect off;\n }\n }\n EOH\n destination = \"custom/docs.conf\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n 
#\n # https://www.nomadproject.io/docs/job-specification/service.html\n #\n service {\n name = \"nginx\"\n port = \"https\"\n tags = [ \"docs\", \"logs\" ]\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources.html\n #\n resources {\n cpu = 1000\n memory = 1024\n network {\n mode = \"bridge\"\n port \"https\" {\n static = 443\n }\n }\n }\n }\n }\n}",
+ "vars": {
+ "datacenters": "yul1",
+ "job_name": "prod-nginx"
+ }
+ },
+ "sensitive_attributes": []
+ }
+ ]
+ },
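Editor's note: the `upstream storage` block rendered above resolves `storage0` through `storage3` via Consul's tagged-service DNS. Each of the four Minio allocations registers the `storage` service with a `storage${NOMAD_ALLOC_INDEX}` tag, and Consul answers queries of the form `<tag>.<service>.service.consul`. The relevant fragment from the minio jobspec, annotated:

    service {
      name = "storage"
      port = "http"
      # Allocation indices 0..3 yield tags storage0..storage3, hence
      # storage0.storage.service.consul etc. in the nginx upstream.
      tags = [ "storage${NOMAD_ALLOC_INDEX}" ]
    }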
+ {
+ "module": "module.nginx",
+ "mode": "managed",
+ "type": "nomad_job",
+ "name": "nomad_job_nginx",
+ "provider": "provider[\"registry.terraform.io/hashicorp/nomad\"].yul1",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "allocation_ids": [
+ "1dea6de0-d5cd-2a1b-fb42-0468e986e0d6"
+ ],
+ "datacenters": [
+ "yul1"
+ ],
+ "deployment_id": "",
+ "deployment_status": "",
+ "deregister_on_destroy": true,
+ "deregister_on_id_change": true,
+ "detach": false,
+ "id": "prod-nginx",
+ "jobspec": "job \"prod-nginx\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 0\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = false\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 0\n }\n\n # The reschedule stanza specifies the group's rescheduling strategy. If\n # specified at the job level, the configuration will apply to all groups\n # within the job. If the reschedule stanza is present on both the job and the\n # group, they are merged with the group stanza taking the highest precedence\n # and then the job.\n reschedule {\n delay = \"30s\"\n delay_function = \"constant\"\n unlimited = true\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-nginx\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The restart stanza configures a tasks's behavior on task failure. Restarts\n # happen on the client that is running the task.\n restart {\n interval = \"10m\"\n attempts = 2\n delay = \"15s\"\n mode = \"fail\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task.html\n #\n task \"prod-task1-nginx\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"nginx:stable\"\n dns_servers = [ \"${attr.unique.network.ip-address}\" ]\n port_map {\n https = 443\n }\n privileged = false\n volumes = [\n \"/etc/consul.d/ssl/consul.pem:/etc/ssl/certs/nginx-cert.pem\",\n \"/etc/consul.d/ssl/consul-key.pem:/etc/ssl/private/nginx-key.pem\",\n \"custom/upstream.conf:/etc/nginx/conf.d/upstream.conf\",\n \"custom/logs.conf:/etc/nginx/conf.d/logs.conf\",\n \"custom/docs.conf:/etc/nginx/conf.d/docs.conf\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. 
This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template.html\n #\n template {\n data = \u003c\u003cEOH\n upstream storage {\n server storage0.storage.service.consul:9000;\n server storage1.storage.service.consul:9000;\n server storage2.storage.service.consul:9000;\n server storage3.storage.service.consul:9000;\n }\n EOH\n destination = \"custom/upstream.conf\"\n }\n template {\n data = \u003c\u003cEOH\n server {\n listen 443 ssl default_server;\n server_name logs.nginx.service.consul;\n keepalive_timeout 70;\n ssl_session_cache shared:SSL:10m;\n ssl_session_timeout 10m;\n ssl_protocols TLSv1.2;\n ssl_prefer_server_ciphers on;\n ssl_ciphers \"ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES256-SHA384:ECDHE-RSA-AES256-SHA384\";\n ssl_certificate /etc/ssl/certs/nginx-cert.pem;\n ssl_certificate_key /etc/ssl/private/nginx-key.pem;\n location / {\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/logs.fd.io/;\n server_name_in_redirect off;\n }\n location ~ (.*html.gz)$ {\n add_header Content-Encoding gzip;\n add_header Content-Type text/html;\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/logs.fd.io/$1;\n server_name_in_redirect off;\n }\n location ~ (.*txt.gz|.*log.gz)$ {\n add_header Content-Encoding gzip;\n add_header Content-Type text/plain;\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/logs.fd.io/$1;\n server_name_in_redirect off;\n }\n location ~ (.*xml.gz)$ {\n add_header Content-Encoding gzip;\n add_header Content-Type application/xml;\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/logs.fd.io/$1;\n server_name_in_redirect off;\n }\n }\n EOH\n destination = \"custom/logs.conf\"\n }\n template {\n data = \u003c\u003cEOH\n server {\n listen 443 ssl;\n server_name docs.nginx.service.consul;\n keepalive_timeout 70;\n ssl_session_cache shared:SSL:10m;\n ssl_session_timeout 10m;\n ssl_protocols TLSv1.2;\n ssl_prefer_server_ciphers on;\n ssl_ciphers \"ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES256-SHA384:ECDHE-RSA-AES256-SHA384\";\n ssl_certificate /etc/ssl/certs/nginx-cert.pem;\n ssl_certificate_key /etc/ssl/private/nginx-key.pem;\n location / {\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/docs.fd.io/;\n server_name_in_redirect off;\n }\n }\n EOH\n destination = \"custom/docs.conf\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n 
#\n # https://www.nomadproject.io/docs/job-specification/service.html\n #\n service {\n name = \"nginx\"\n port = \"https\"\n tags = [ \"docs\", \"logs\" ]\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources.html\n #\n resources {\n cpu = 1000\n memory = 1024\n network {\n mode = \"bridge\"\n port \"https\" {\n static = 443\n }\n }\n }\n }\n }\n}",
+ "json": null,
+ "modify_index": "5922474",
+ "name": "prod-nginx",
+ "namespace": "default",
+ "policy_override": null,
+ "purge_on_destroy": null,
+ "region": "global",
+ "task_groups": [
+ {
+ "count": 1,
+ "meta": {},
+ "name": "prod-group1-nginx",
+ "task": [
+ {
+ "driver": "docker",
+ "meta": {},
+ "name": "prod-task1-nginx",
+ "volume_mounts": []
+ }
+ ],
+ "volumes": []
+ }
+ ],
+ "type": "service"
+ },
+ "sensitive_attributes": [],
+ "private": "bnVsbA==",
+ "dependencies": [
+ "module.nginx.data.template_file.nomad_job_nginx"
+ ]
+ }
+ ]
+ },
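The entry ending above is what the Terraform Nomad provider records for a managed nomad_job: the rendered template output becomes the jobspec, and the "dependencies" list ties the resource back to its template_file data source. A minimal, hypothetical sketch of the resource block that would produce such an entry (the local name and provider alias are inferred from the state, not taken from this repository's sources):

    # Hypothetical sketch of the resource behind the state entry above.
    resource "nomad_job" "nomad_job_nginx" {
      # Assumed alias; the state records provider "...hashicorp/nomad\"].yul1".
      provider = nomad.yul1

      # The rendered HCL jobspec from the template_file data source; this
      # reference is what the "dependencies" list above records.
      jobspec = data.template_file.nomad_job_nginx.rendered

      # Mirrors the lifecycle attributes recorded in the instances above.
      detach                  = false
      deregister_on_destroy   = true
      deregister_on_id_change = true
    }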
+ {
+ "module": "module.prometheus",
+ "mode": "data",
+ "type": "template_file",
+ "name": "nomad_job_prometheus",
+ "provider": "provider[\"registry.terraform.io/hashicorp/template\"]",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "filename": null,
+ "id": "cd26d062785357d96c95d6f8f40b2d8d7b430be3399de176e6b250822c4af86c",
+ "rendered": "job \"prod-prometheus\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-prometheus\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n \n volume \"prod-volume1-prometheus\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-prometheus\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n volume_mount {\n volume = \"prod-volume1-prometheus\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=/data/prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. 
This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"The number of Consul raft peers should be 3 in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might have crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[2m])) * 100) \u003e 80\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 80%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. 
Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack entries is approaching the limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }}).\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. 
It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.30:8500', '10.30.51.32:8500', '10.30.51.33:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - 
targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'cadvisorexporter' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Node Exporter'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nodeexporter' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"prometheus\"\n port = \"prometheus\"\n tags = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 8192\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"prometheus\" {\n static = 9090\n }\n }\n }\n }\n }\n}",
+ "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n %{ if use_host_volume }\n volume \"prod-volume1-${service_name}\" {\n type = \"host\"\n read_only = false\n source = \"${host_volume}\"\n }\n %{ endif }\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n %{ if use_host_volume }\n volume_mount {\n volume = \"prod-volume1-${service_name}\"\n destination = \"${data_dir}\"\n read_only = false\n }\n %{ endif }\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-${version}.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=${data_dir}prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${url}\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. 
This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"The number of Consul raft peers should be 3 in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might have crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[2m])) * 100) \u003e 80\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 80%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. 
Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack entries is approaching the limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }}).\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. 
It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.30:8500', '10.30.51.32:8500', '10.30.51.33:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - 
targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'cadvisorexporter' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Node Exporter'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nodeexporter' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}",
+ "vars": {
+ "cpu": "2000",
+ "data_dir": "/data/",
+ "datacenters": "yul1",
+ "group_count": "4",
+ "host_volume": "prod-volume-data1-1",
+ "job_name": "prod-prometheus",
+ "mem": "8192",
+ "port": "9090",
+ "service_name": "prometheus",
+ "url": "https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz",
+ "use_canary": "true",
+ "use_host_volume": "true",
+ "use_vault_provider": "false",
+ "version": "2.24.0"
+ }
+ },
+ "sensitive_attributes": []
+ }
+ ]
+ },
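The template_file instance ending above records both halves of the rendering: the "template" attribute holds the raw jobspec with ${...} placeholders, the "vars" map supplies their values, and "rendered" is the result. Placeholders intended for Nomad rather than Terraform are escaped as $${...} (for example $${attr.cpu.arch} and $${NOMAD_ALLOC_INDEX}) so they pass through rendering literally. A hedged sketch of the data source that would yield this instance (the .tpl filename is an assumption; only the rendered output is recorded in state):

    # Hypothetical sketch of the data source behind the instance above.
    data "template_file" "nomad_job_prometheus" {
      # Assumed template location; adjust to the repository's actual layout.
      template = file("${path.module}/prometheus.nomad.tpl")

      # Values recorded under "vars" above; each replaces the matching
      # ${name} placeholder, while $${...} sequences are left for Nomad.
      vars = {
        job_name           = "prod-prometheus"
        datacenters        = "yul1"
        group_count        = "4"
        cpu                = "2000"
        mem                = "8192"
        port               = "9090"
        service_name       = "prometheus"
        data_dir           = "/data/"
        host_volume        = "prod-volume-data1-1"
        url                = "https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz"
        version            = "2.24.0"
        use_canary         = "true"
        use_host_volume    = "true"
        use_vault_provider = "false"
      }
    }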
+ {
+ "module": "module.prometheus",
+ "mode": "managed",
+ "type": "nomad_job",
+ "name": "nomad_job_prometheus",
+ "provider": "provider[\"registry.terraform.io/hashicorp/nomad\"].yul1",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "allocation_ids": [
+ "79c4c6b7-e8e4-79e7-9dae-9a26142cd94c",
+ "2e11f44e-814e-db74-a9ee-8bcbebb1fa0b",
+ "83f9c9aa-f4f2-b832-a937-aaee4181493e",
+ "231f1338-a871-b44e-d545-e8af01d7eebe"
+ ],
+ "datacenters": [
+ "yul1"
+ ],
+ "deployment_id": "4cbf8370-f2d7-16c2-d500-f6495d43537d",
+ "deployment_status": "successful",
+ "deregister_on_destroy": true,
+ "deregister_on_id_change": true,
+ "detach": false,
+ "id": "prod-prometheus",
+ "jobspec": "job \"prod-prometheus\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-prometheus\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n \n volume \"prod-volume1-prometheus\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-prometheus\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n volume_mount {\n volume = \"prod-volume1-prometheus\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=/data/prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. 
This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"The number of Consul raft peers should be 3 in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might have crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[2m])) * 100) \u003e 80\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 80%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. 
Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack entries is approaching the limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }}).\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. 
It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.30:8500', '10.30.51.32:8500', '10.30.51.33:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - 
targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'cadvisorexporter' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Node Exporter'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nodeexporter' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"prometheus\"\n port = \"prometheus\"\n tags = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 8192\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"prometheus\" {\n static = 9090\n }\n }\n }\n }\n }\n}",
+ "json": null,
+ "modify_index": "7147932",
+ "name": "prod-prometheus",
+ "namespace": "default",
+ "policy_override": null,
+ "purge_on_destroy": null,
+ "region": "global",
+ "task_groups": [
+ {
+ "count": 4,
+ "meta": {},
+ "name": "prod-group1-prometheus",
+ "task": [
+ {
+ "driver": "exec",
+ "meta": {},
+ "name": "prod-task1-prometheus",
+ "volume_mounts": [
+ {
+ "destination": "/data/",
+ "read_only": false,
+ "volume": "prod-volume1-prometheus"
+ }
+ ]
+ }
+ ],
+ "volumes": [
+ {
+ "name": "prod-volume1-prometheus",
+ "read_only": false,
+ "source": "prod-volume-data1-1",
+ "type": "host"
+ }
+ ]
+ }
+ ],
+ "type": "service"
+ },
+ "sensitive_attributes": [],
+ "private": "bnVsbA==",
+ "dependencies": [
+ "module.prometheus.data.template_file.nomad_job_prometheus"
+ ]
+ }
+ ]
+ },
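The prometheus.yml rendered above leans on two recurring patterns: Consul service discovery (every consul_sd_configs block points at the agent on NOMAD_IP_prometheus:8500) and relabeling. The blackbox jobs use the standard three-step relabel chain so the probed host survives as the instance label while the scrape itself goes to the local exporter. A condensed sketch of just that chain, lifted from the rendered config (one target shown; values as in the state above):

template {
  destination = "secrets/prometheus.yml"
  data        = <<EOH
scrape_configs:
  - job_name: 'Blackbox Exporter (icmp)'
    metrics_path: /probe
    params:
      module: [ 'icmp_v4' ]            # probe module defined on the blackbox exporter side
    static_configs:
      - targets: [ 'gerrit.fd.io' ]    # host to probe, not an address to scrape
    relabel_configs:
      - source_labels: [__address__]   # 1. copy the listed target into the ?target= parameter
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance         # 2. keep the probed host as the instance label
      - target_label: __address__
        replacement: localhost:9115    # 3. send the actual scrape to the local exporter
EOH
}

Without the last rule each probe would be scraped from the probed host itself; with it, Prometheus queries localhost:9115/probe?target=gerrit.fd.io&module=icmp_v4 and still reports the result under the original instance label.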
+ {
+ "module": "module.vpp_device",
+ "mode": "data",
+ "type": "template_file",
+ "name": "nomad_job_csit_shim",
+ "provider": "provider[\"registry.terraform.io/hashicorp/template\"]",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "filename": null,
+ "id": "a285223159ce0af9e5427d857021e5651c09408af90decd3b93a90efd4fd7ce8",
+ "rendered": "job \"prod-device-csit-shim\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"system\"\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-csit-shim-amd\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n constraint {\n attribute = \"${node.class}\"\n value = \"csit\"\n }\n\n restart {\n interval = \"1m\"\n attempts = 3\n delay = \"15s\"\n mode = \"delay\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task.html\n #\n task \"prod-task1-csit-shim-amd\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"csit_shim-ubuntu1804:local\"\n network_mode = \"host\"\n pid_mode = \"host\"\n volumes = [\n \"/var/run/docker.sock:/var/run/docker.sock\"\n ]\n privileged = true\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources.html\n #\n resources {\n cpu = 1000\n memory = 5000\n network {\n port \"ssh\" {\n static = 6022\n }\n port \"ssh2\" {\n static = 6023\n }\n }\n }\n }\n }\n\n group \"prod-group1-csit-shim-arm\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. 
This value must be non-negative and defaults\n # to 1.\n count = 1\n\n constraint {\n attribute = \"${node.class}\"\n value = \"csitarm\"\n }\n\n restart {\n interval = \"1m\"\n attempts = 3\n delay = \"15s\"\n mode = \"delay\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task.html\n #\n task \"prod-task1-csit-shim-arm\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"csit_shim-ubuntu1804:local\"\n network_mode = \"host\"\n pid_mode = \"host\"\n volumes = [\n \"/var/run/docker.sock:/var/run/docker.sock\"\n ]\n privileged = true\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources.html\n #\n resources {\n cpu = 1000\n memory = 5000\n network {\n port \"ssh\" {\n static = 6022\n }\n port \"ssh2\" {\n static = 6023\n }\n }\n }\n }\n }\n}",
+ "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"system\"\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-csit-shim-amd\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n constraint {\n attribute = \"$${node.class}\"\n value = \"csit\"\n }\n\n restart {\n interval = \"1m\"\n attempts = 3\n delay = \"15s\"\n mode = \"delay\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task.html\n #\n task \"prod-task1-csit-shim-amd\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"csit_shim-ubuntu1804:local\"\n network_mode = \"host\"\n pid_mode = \"host\"\n volumes = [\n \"/var/run/docker.sock:/var/run/docker.sock\"\n ]\n privileged = true\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources.html\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n network {\n port \"ssh\" {\n static = 6022\n }\n port \"ssh2\" {\n static = 6023\n }\n }\n }\n }\n }\n\n group \"prod-group1-csit-shim-arm\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. 
This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n constraint {\n attribute = \"$${node.class}\"\n value = \"csitarm\"\n }\n\n restart {\n interval = \"1m\"\n attempts = 3\n delay = \"15s\"\n mode = \"delay\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task.html\n #\n task \"prod-task1-csit-shim-arm\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"csit_shim-ubuntu1804:local\"\n network_mode = \"host\"\n pid_mode = \"host\"\n volumes = [\n \"/var/run/docker.sock:/var/run/docker.sock\"\n ]\n privileged = true\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources.html\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n network {\n port \"ssh\" {\n static = 6022\n }\n port \"ssh2\" {\n static = 6023\n }\n }\n }\n }\n }\n}",
+ "vars": {
+ "cpu": "1000",
+ "datacenters": "yul1",
+ "group_count": "1",
+ "job_name": "prod-device-csit-shim",
+ "mem": "5000"
+ }
+ },
+ "sensitive_attributes": []
+ }
+ ]
+ },
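Every module recorded in this state follows the same two-resource pattern as the csit_shim entry above: a template_file data source renders the jobspec from a template plus a vars map, and a nomad_job resource (the managed entry that follows) submits the rendered text, which is why its dependencies list names the data source. A minimal sketch of the pair, assuming the job template sits next to the module; the file name csit_shim.nomad.hcl is illustrative:

data "template_file" "nomad_job_csit_shim" {
  # Assumed on-disk location of the job template captured in the
  # "template" attribute above.
  template = file("${path.module}/csit_shim.nomad.hcl")
  vars = {
    job_name    = "prod-device-csit-shim"
    datacenters = "yul1"
    group_count = "1"
    cpu         = "1000"
    mem         = "5000"
  }
}

resource "nomad_job" "nomad_job_csit_shim" {
  # Alias matches the provider recorded for the managed resource below.
  provider = nomad.yul1
  jobspec  = data.template_file.nomad_job_csit_shim.rendered
}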
+ {
+ "module": "module.vpp_device",
+ "mode": "managed",
+ "type": "nomad_job",
+ "name": "nomad_job_csit_shim",
+ "provider": "provider[\"registry.terraform.io/hashicorp/nomad\"].yul1",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "allocation_ids": [
+ "ecfcd9ad-b959-0fa4-c1ec-8b82195830f1",
+ "190f5699-0d10-7f85-ac68-96927702070b",
+ "2fbbef45-f00a-1367-08c4-c2239de9e78d",
+ "dd57bc83-2229-d242-1caf-b743f186e7a2"
+ ],
+ "datacenters": [
+ "yul1"
+ ],
+ "deployment_id": "",
+ "deployment_status": "",
+ "deregister_on_destroy": true,
+ "deregister_on_id_change": true,
+ "detach": false,
+ "id": "prod-device-csit-shim",
+ "jobspec": "job \"prod-device-csit-shim\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"system\"\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-csit-shim-amd\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n constraint {\n attribute = \"${node.class}\"\n value = \"csit\"\n }\n\n restart {\n interval = \"1m\"\n attempts = 3\n delay = \"15s\"\n mode = \"delay\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task.html\n #\n task \"prod-task1-csit-shim-amd\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"csit_shim-ubuntu1804:local\"\n network_mode = \"host\"\n pid_mode = \"host\"\n volumes = [\n \"/var/run/docker.sock:/var/run/docker.sock\"\n ]\n privileged = true\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources.html\n #\n resources {\n cpu = 1000\n memory = 5000\n network {\n port \"ssh\" {\n static = 6022\n }\n port \"ssh2\" {\n static = 6023\n }\n }\n }\n }\n }\n\n group \"prod-group1-csit-shim-arm\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. 
This value must be non-negative and defaults\n # to 1.\n count = 1\n\n constraint {\n attribute = \"${node.class}\"\n value = \"csitarm\"\n }\n\n restart {\n interval = \"1m\"\n attempts = 3\n delay = \"15s\"\n mode = \"delay\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task.html\n #\n task \"prod-task1-csit-shim-arm\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"csit_shim-ubuntu1804:local\"\n network_mode = \"host\"\n pid_mode = \"host\"\n volumes = [\n \"/var/run/docker.sock:/var/run/docker.sock\"\n ]\n privileged = true\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources.html\n #\n resources {\n cpu = 1000\n memory = 5000\n network {\n port \"ssh\" {\n static = 6022\n }\n port \"ssh2\" {\n static = 6023\n }\n }\n }\n }\n }\n}",
+ "json": null,
+ "modify_index": "7077736",
+ "name": "prod-device-csit-shim",
+ "namespace": "default",
+ "policy_override": null,
+ "purge_on_destroy": null,
+ "region": "global",
+ "task_groups": [
+ {
+ "count": 1,
+ "meta": {},
+ "name": "prod-group1-csit-shim-amd",
+ "task": [
+ {
+ "driver": "docker",
+ "meta": {},
+ "name": "prod-task1-csit-shim-amd",
+ "volume_mounts": []
+ }
+ ],
+ "volumes": []
+ },
+ {
+ "count": 1,
+ "meta": {},
+ "name": "prod-group1-csit-shim-arm",
+ "task": [
+ {
+ "driver": "docker",
+ "meta": {},
+ "name": "prod-task1-csit-shim-arm",
+ "volume_mounts": []
+ }
+ ],
+ "volumes": []
+ }
+ ],
+ "type": "system"
+ },
+ "sensitive_attributes": [],
+ "private": "bnVsbA==",
+ "dependencies": [
+ "module.vpp_device.data.template_file.nomad_job_csit_shim"
+ ]
+ }
+ ]
+ }
+ ]
+}
+>>>>>>> CHANGE (a44eef Infra: Monitoring capability)
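One detail the template/rendered pairs above make easy to miss: the job templates carry two kinds of interpolation. Single-dollar ${...} references (job_name, group_count, cpu, mem) are substituted by Terraform's template engine, while double-dollar $${...} sequences escape to a literal ${...} that Nomad resolves at placement time. Side by side, using the csit_shim values from the state above:

# As stored in the template_file data source:
count     = ${group_count}      # single "$": Terraform substitutes "1" from vars
attribute = "$${node.class}"    # double "$": collapses to a literal ${node.class}

# As rendered and submitted to Nomad:
count     = 1
attribute = "${node.class}"     # resolved by Nomad against each client node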
diff --git a/terraform-ci-infra/1n_nmd/terraform.tfstate.backup b/terraform-ci-infra/1n_nmd/terraform.tfstate.backup
new file mode 100644
index 0000000000..ba12fe81dd
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/terraform.tfstate.backup
@@ -0,0 +1,831 @@
+<<<<<<< HEAD (dae934 Infra: Remove Consul TLS on clients (Nomad conflict))
+=======
+{
+ "version": 4,
+ "terraform_version": "0.14.5",
+ "serial": 1026,
+ "lineage": "e4e7f30a-652d-7a31-e31c-5e3a3388c9b9",
+ "outputs": {},
+ "resources": [
+ {
+ "module": "module.alertmanager",
+ "mode": "data",
+ "type": "template_file",
+ "name": "nomad_job_alertmanager",
+ "provider": "provider[\"registry.terraform.io/hashicorp/template\"]",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "filename": null,
+ "id": "ae09d392c808970a05f2423cd8dbf158f6cb6d3ae50260cd7d16f1575ce43871",
+ "rendered": "job \"prod-alertmanager\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-alertmanager\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-alertmanager\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. 
This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\nglobal:\n # The API URL to use for Slack notifications.\n slack_api_url: 'https://hooks.slack.com/services/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'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: 'default-receiver'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname', 'cluster', 'service']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n # This routes performs a regular expression match on alert labels to\n # catch alerts that are related to a list of services.\n - match_re:\n service: .*\n receiver: default-receiver\n # The service has a sub-route for critical alerts, any alerts\n # that do not match, i.e. 
severity != critical, fall-back to the\n # parent node and are sent to 'team-X-mails'\n routes:\n - match:\n severity: critical\n receiver: 'default-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n # Apply inhibition if the alertname is the same.\n # CAUTION:\n # If all label names listed in `equal` are missing\n # from both the source and target alerts,\n # the inhibition rule will apply!\n equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: 'default-receiver'\n slack_configs:\n - channel: '#fdio-infra-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"alertmanager\"\n port = \"alertmanager\"\n tags = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 1000\n memory = 1024\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"alertmanager\" {\n static = 9093\n }\n }\n }\n }\n }\n}",
+ "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-${version}.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${url}\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. 
This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\nglobal:\n # The API URL to use for Slack notifications.\n slack_api_url: '${slack_api_url}'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: '${default_receiver}'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname', 'cluster', 'service']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n # This routes performs a regular expression match on alert labels to\n # catch alerts that are related to a list of services.\n - match_re:\n service: .*\n receiver: ${default_receiver}\n # The service has a sub-route for critical alerts, any alerts\n # that do not match, i.e. 
severity != critical, fall-back to the\n # parent node and are sent to 'team-X-mails'\n routes:\n - match:\n severity: critical\n receiver: '${default_receiver}'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n # Apply inhibition if the alertname is the same.\n # CAUTION:\n # If all label names listed in `equal` are missing\n # from both the source and target alerts,\n # the inhibition rule will apply!\n equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: '${default_receiver}'\n slack_configs:\n - channel: '#${slack_channel}'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}",
+ "vars": {
+ "cpu": "1000",
+ "datacenters": "yul1",
+ "default_receiver": "default-receiver",
+ "group_count": "1",
+ "job_name": "prod-alertmanager",
+ "mem": "1024",
+ "port": "9093",
+ "service_name": "alertmanager",
+ "slack_api_url": "https://hooks.slack.com/services/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",
+ "slack_channel": "fdio-infra-monitoring",
+ "url": "https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz",
+ "use_canary": "true",
+ "use_vault_provider": "false",
+ "version": "0.21.0"
+ }
+ },
+ "sensitive_attributes": []
+ }
+ ]
+ },
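The Alertmanager template also shows why these stanzas override Nomad's default template delimiters: the Slack title and text blocks are Go templates full of {{ ... }} actions meant for Alertmanager itself, so consul-template is switched to triple braces and the double-braced actions pass through verbatim. The shape of the stanza, with the config body trimmed to one receiver:

template {
  change_mode     = "noop"
  change_signal   = "SIGINT"
  destination     = "secrets/alertmanager.yml"
  # consul-template now only expands {{{ ... }}}; Alertmanager's own
  # {{ .Status }}, {{ range .Alerts }} and friends survive rendering.
  left_delimiter  = "{{{"
  right_delimiter = "}}}"
  data            = <<EOH
receivers:
- name: 'default-receiver'
  slack_configs:
  - channel: '#fdio-infra-monitoring'
    send_resolved: true
    title: |-
      [{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}
EOH
}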
+ {
+ "module": "module.alertmanager",
+ "mode": "managed",
+ "type": "nomad_job",
+ "name": "nomad_job_alertmanager",
+ "provider": "provider[\"registry.terraform.io/hashicorp/nomad\"].yul1",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "allocation_ids": [
+ "464144d9-8c27-dc03-71bb-6f8d525bc486",
+ "d29e4d1a-be78-d7ec-0b53-c2b6ebcd2e00"
+ ],
+ "datacenters": [
+ "yul1"
+ ],
+ "deployment_id": "d69d73a6-2a1b-ebf6-e421-30ce1f34f519",
+ "deployment_status": "successful",
+ "deregister_on_destroy": true,
+ "deregister_on_id_change": true,
+ "detach": false,
+ "id": "prod-alertmanager",
+ "jobspec": "job \"prod-alertmanager\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-alertmanager\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-alertmanager\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. 
This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\nglobal:\n # The API URL to use for Slack notifications.\n slack_api_url: 'https://hooks.slack.com/services/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'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: 'default-receiver'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname', 'cluster', 'service']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n # This routes performs a regular expression match on alert labels to\n # catch alerts that are related to a list of services.\n - match_re:\n service: .*\n receiver: default-receiver\n # The service has a sub-route for critical alerts, any alerts\n # that do not match, i.e. 
severity != critical, fall back to the\n # parent node and are sent to 'default-receiver'\n routes:\n - match:\n severity: critical\n receiver: 'default-receiver'\n\n# Inhibition rules allow muting a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n # Apply inhibition if the alertname is the same.\n # CAUTION:\n # If all label names listed in `equal` are missing\n # from both the source and target alerts,\n # the inhibition rule will apply!\n equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: 'default-receiver'\n slack_configs:\n - channel: '#fdio-infra-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"service\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"alertmanager\"\n port = \"alertmanager\"\n tags = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 1000\n memory = 1024\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"network\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"alertmanager\" {\n static = 9093\n }\n }\n }\n }\n }\n}",
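Aside: the alertmanager.yml rendered by the template above can be validated offline before a redeploy. A hedged sketch (amtool ships inside the same release tarball that the artifact stanza fetches; the local paths here are illustrative):

    # Validate the generated Alertmanager configuration with amtool.
    ./alertmanager-0.21.0.linux-amd64/amtool check-config secrets/alertmanager.yml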
+ "json": null,
+ "modify_index": "7147924",
+ "name": "prod-alertmanager",
+ "namespace": "default",
+ "policy_override": null,
+ "purge_on_destroy": null,
+ "region": "global",
+ "task_groups": [
+ {
+ "count": 1,
+ "meta": {},
+ "name": "prod-group1-alertmanager",
+ "task": [
+ {
+ "driver": "exec",
+ "meta": {},
+ "name": "prod-task1-alertmanager",
+ "volume_mounts": []
+ }
+ ],
+ "volumes": []
+ }
+ ],
+ "type": "service"
+ },
+ "sensitive_attributes": [],
+ "private": "bnVsbA==",
+ "dependencies": [
+ "module.alertmanager.data.template_file.nomad_job_alertmanager"
+ ]
+ }
+ ]
+ },
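Each monitoring component in this state follows the same two-step pattern: a template_file data source renders the Nomad HCL, and a nomad_job resource submits the rendered jobspec through the aliased yul1 provider. A minimal sketch of that wiring, assuming module-local file names (prod_alertmanager.nomad is illustrative, not necessarily the repo's actual path):

    # Render the Nomad job from a template shipped with the module.
    data "template_file" "nomad_job_alertmanager" {
      template = file("${path.module}/prod_alertmanager.nomad")
      vars = {
        job_name    = "prod-alertmanager"
        datacenters = "yul1"
      }
    }

    # Submit the rendered jobspec; this is the dependency edge recorded above
    # as module.alertmanager.data.template_file.nomad_job_alertmanager.
    resource "nomad_job" "nomad_job_alertmanager" {
      provider = nomad.yul1
      jobspec  = data.template_file.nomad_job_alertmanager.rendered
    }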
+ {
+ "module": "module.exporter",
+ "mode": "data",
+ "type": "template_file",
+ "name": "nomad_job_exporter",
+ "provider": "provider[\"registry.terraform.io/hashicorp/template\"]",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "filename": null,
+ "id": "28db119a83617686c94496ed114455041d4fc500d93c4de8f8b8cfbe7d4c1db2",
+ "rendered": "job \"prod-exporter\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"system\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-exporter-amd64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-nodeexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"raw_exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/node_exporter-1.0.1.linux-amd64/node_exporter\"\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. 
Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-amd64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"nodeexporter\"\n port = \"nodeexporter\"\n check {\n name = \"Node Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"nodeexporter\" {\n static = 9100\n }\n }\n }\n }\n task \"prod-task2-blackboxexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/blackbox_exporter-0.18.0.linux-amd64/blackbox_exporter\"\n args = [\n \"--config.file=secrets/blackbox.yml\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/blackbox.yml\"\n data = \u003c\u003cEOH\nmodules:\n http_2xx:\n prober: http\n timeout: 5s\n http:\n valid_http_versions: [\"HTTP/1.1\", \"HTTP/2.0\"]\n no_follow_redirects: false\n fail_if_ssl: false\n fail_if_not_ssl: true\n tls_config:\n insecure_skip_verify: false\n preferred_ip_protocol: \"ip4\"\n icmp_v4:\n prober: icmp\n timeout: 5s\n icmp:\n preferred_ip_protocol: \"ip4\"\n dns_udp:\n prober: dns\n timeout: 5s\n dns:\n query_name: \"jenkins.fd.io\"\n query_type: \"A\"\n valid_rcodes:\n - NOERROR\nEOH\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. 
Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-amd64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"blackboxexporter\"\n port = \"blackboxexporter\"\n tags = [ \"blackboxexporter${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Blackbox Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"blackboxexporter\" {\n static = 9115\n }\n }\n }\n }\n\n task \"prod-task3-cadvisorexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"gcr.io/cadvisor/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"cadvisorexporter\"\n port = \"cadvisorexporter\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. 
Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"cadvisorexporter\" {\n static = 8080\n }\n }\n }\n }\n }\n\n group \"prod-group1-exporter-arm64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"==\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-nodeexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"raw_exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/node_exporter-1.0.1.linux-arm64/node_exporter\"\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-arm64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"nodeexporter\"\n port = \"nodeexporter\"\n check {\n name = \"Node Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"nodeexporter\" {\n static = 9100\n }\n }\n }\n }\n\n task \"prod-task2-blackboxexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. 
The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/blackbox_exporter-0.18.0.linux-arm64/blackbox_exporter\"\n args = [\n \"--config.file=secrets/blackbox.yml\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/blackbox.yml\"\n data = \u003c\u003cEOH\nmodules:\n http_2xx:\n prober: http\n timeout: 5s\n http:\n valid_http_versions: [\"HTTP/1.1\", \"HTTP/2.0\"]\n no_follow_redirects: false\n fail_if_ssl: false\n fail_if_not_ssl: true\n tls_config:\n insecure_skip_verify: false\n preferred_ip_protocol: \"ip4\"\n icmp_v4:\n prober: icmp\n timeout: 5s\n icmp:\n preferred_ip_protocol: \"ip4\"\n dns_udp:\n prober: dns\n timeout: 5s\n dns:\n query_name: \"jenkins.fd.io\"\n query_type: \"A\"\n valid_rcodes:\n - NOERROR\nEOH\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-arm64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"blackboxexporter\"\n port = \"blackboxexporter\"\n tags = [ \"blackboxexporter${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Blackbox Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"blackboxexporter\" {\n static = 9115\n }\n }\n }\n }\n\n task \"prod-task3-cadvisorexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. 
The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n # There is currently no official release for arm yet...using community.\n image = \"zcube/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"cadvisorexporter\"\n port = \"cadvisorexporter\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"cadvisorexporter\" {\n static = 8080\n }\n }\n }\n }\n }\n}",
+ "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"system\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-exporter-amd64\" {\n # The constraint allows restricting the set of eligible nodes. 
Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${node_service_name}-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"raw_exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/node_exporter-${node_version}.linux-amd64/node_exporter\"\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${node_url_amd64}\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${node_service_name}\"\n port = \"${node_service_name}\"\n check {\n name = \"Node Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${node_service_name}\" {\n static = ${node_port}\n }\n }\n }\n }\n task \"prod-task2-${blackbox_service_name}-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/blackbox_exporter-${blackbox_version}.linux-amd64/blackbox_exporter\"\n args = [\n \"--config.file=secrets/blackbox.yml\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. 
This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/blackbox.yml\"\n data = \u003c\u003cEOH\nmodules:\n http_2xx:\n prober: http\n timeout: 5s\n http:\n valid_http_versions: [\"HTTP/1.1\", \"HTTP/2.0\"]\n no_follow_redirects: false\n fail_if_ssl: false\n fail_if_not_ssl: true\n tls_config:\n insecure_skip_verify: false\n preferred_ip_protocol: \"ip4\"\n icmp_v4:\n prober: icmp\n timeout: 5s\n icmp:\n preferred_ip_protocol: \"ip4\"\n dns_udp:\n prober: dns\n timeout: 5s\n dns:\n query_name: \"jenkins.fd.io\"\n query_type: \"A\"\n valid_rcodes:\n - NOERROR\nEOH\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${blackbox_url_amd64}\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${blackbox_service_name}\"\n port = \"${blackbox_service_name}\"\n tags = [ \"${blackbox_service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Blackbox Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${blackbox_service_name}\" {\n static = ${blackbox_port}\n }\n }\n }\n }\n\n task \"prod-task3-${cadvisor_service_name}-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. 
The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"${cadvisor_image}\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${cadvisor_service_name}\"\n port = \"${cadvisor_service_name}\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${cadvisor_service_name}\" {\n static = ${cadvisor_port}\n }\n }\n }\n }\n }\n\n group \"prod-group1-exporter-arm64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"==\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${node_service_name}-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"raw_exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/node_exporter-${node_version}.linux-arm64/node_exporter\"\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. 
Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${node_url_arm64}\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${node_service_name}\"\n port = \"${node_service_name}\"\n check {\n name = \"Node Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${node_service_name}\" {\n static = ${node_port}\n }\n }\n }\n }\n\n task \"prod-task2-${blackbox_service_name}-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/blackbox_exporter-${blackbox_version}.linux-arm64/blackbox_exporter\"\n args = [\n \"--config.file=secrets/blackbox.yml\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/blackbox.yml\"\n data = \u003c\u003cEOH\nmodules:\n http_2xx:\n prober: http\n timeout: 5s\n http:\n valid_http_versions: [\"HTTP/1.1\", \"HTTP/2.0\"]\n no_follow_redirects: false\n fail_if_ssl: false\n fail_if_not_ssl: true\n tls_config:\n insecure_skip_verify: false\n preferred_ip_protocol: \"ip4\"\n icmp_v4:\n prober: icmp\n timeout: 5s\n icmp:\n preferred_ip_protocol: \"ip4\"\n dns_udp:\n prober: dns\n timeout: 5s\n dns:\n query_name: \"jenkins.fd.io\"\n query_type: \"A\"\n valid_rcodes:\n - NOERROR\nEOH\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. 
Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${blackbox_url_arm64}\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${blackbox_service_name}\"\n port = \"${blackbox_service_name}\"\n tags = [ \"${blackbox_service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Blackbox Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${blackbox_service_name}\" {\n static = ${blackbox_port}\n }\n }\n }\n }\n\n task \"prod-task3-${cadvisor_service_name}-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n # There is currently no official release for arm yet...using community.\n image = \"zcube/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${cadvisor_service_name}\"\n port = \"${cadvisor_service_name}\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. 
Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${cadvisor_service_name}\" {\n static = ${cadvisor_port}\n }\n }\n }\n }\n }\n}",
+ "vars": {
+ "blackbox_port": "9115",
+ "blackbox_service_name": "blackboxexporter",
+ "blackbox_url_amd64": "https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-amd64.tar.gz",
+ "blackbox_url_arm64": "https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-arm64.tar.gz",
+ "blackbox_version": "0.18.0",
+ "cadvisor_image": "gcr.io/cadvisor/cadvisor:latest",
+ "cadvisor_port": "8080",
+ "cadvisor_service_name": "cadvisorexporter",
+ "datacenters": "yul1",
+ "job_name": "prod-exporter",
+ "node_port": "9100",
+ "node_service_name": "nodeexporter",
+ "node_url_amd64": "https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-amd64.tar.gz",
+ "node_url_arm64": "https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-arm64.tar.gz",
+ "node_version": "1.0.1",
+ "use_canary": "false"
+ }
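Note that template_file passes every var as a string, including use_canary = "false"; Terraform converts that string to a boolean when evaluating the %{ if use_canary } directive, so the canary / auto_promote / auto_revert block is dropped from the rendered jobspec (the blank lines after progress_deadline in "rendered" above are the trace of that elision). A quick way to confirm the directive semantics in isolation, assuming a hypothetical canary.tftpl holding just the guarded block:

    # In `terraform console`, templatefile() uses the same template engine:
    #   > templatefile("canary.tftpl", { use_canary = "false" })  # block elided
    #   > templatefile("canary.tftpl", { use_canary = "true" })   # block kept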
+ },
+ "sensitive_attributes": []
+ }
+ ]
+ },
+ {
+ "module": "module.exporter",
+ "mode": "managed",
+ "type": "nomad_job",
+ "name": "nomad_job_exporter",
+ "provider": "provider[\"registry.terraform.io/hashicorp/nomad\"].yul1",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "allocation_ids": [
+ "e11d14cf-0b12-7f11-f982-e75abb2fa245",
+ "be1d7792-8b38-0769-1899-4d78ac17d775",
+ "a382862b-e015-bb85-7fed-5a70ab5433e5",
+ "33190f8d-6b97-5a79-32c3-c2a48eb33cab",
+ "7b9a89f3-3094-73aa-de9e-b674cdf3962b",
+ "1b730294-e8c6-d330-72a6-7502ef6d8195",
+ "3b3932ce-d57e-d0c6-0d36-b32724d83a0f",
+ "d07855d6-29a8-dba5-92a8-bed5673c3b81",
+ "4ea7fa69-4819-5050-37ea-bbce995e994d",
+ "2c694f1a-fb0e-67b9-407a-dd5f1d72daa0",
+ "52e828d4-79a7-9e43-2cdb-1ba6088f3257",
+ "e23fc3eb-1699-e1e8-629b-82a21b72ecf4",
+ "6f83b8c0-4313-eb5a-9c22-e8db2226e832",
+ "15ee749c-07f0-2864-f907-919faa62cee2",
+ "12025e7c-4a27-9a9b-b0aa-a205179139eb",
+ "a6234b96-8c7b-1fd4-e85f-d361774d3005",
+ "fadf8b49-0e1c-98ea-42b5-119667fa092e",
+ "2cb8905c-3d36-878d-917e-493f8a982970",
+ "a1426c7b-a4a1-0f19-cc11-162b396e9ce9",
+ "f24eee7b-e480-3386-9a6e-357edb9ae137",
+ "7b7c32b2-9cc9-bb99-b203-2842e4495f5c",
+ "c90d6b3d-93ac-2fa6-e713-aa69b3a3be69"
+ ],
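The 22 allocation IDs above are a consequence of type = "system": Nomad runs one allocation of each matching group on every eligible client, so the list tracks cluster membership rather than a configured count (the amd64/arm64 constraints make the two groups mutually exclusive per node). Standard CLI cross-checks, using any unique ID prefix:

    nomad job status prod-exporter   # allocations per client for the system job
    nomad alloc status e11d14cf      # inspect one allocation by ID prefix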
+ "datacenters": [
+ "yul1"
+ ],
+ "deployment_id": "",
+ "deployment_status": "",
+ "deregister_on_destroy": true,
+ "deregister_on_id_change": true,
+ "detach": false,
+ "id": "prod-exporter",
+ "jobspec": "job \"prod-exporter\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"system\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-exporter-amd64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-nodeexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"raw_exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/node_exporter-1.0.1.linux-amd64/node_exporter\"\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. 
Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-amd64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"nodeexporter\"\n port = \"nodeexporter\"\n check {\n name = \"Node Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"nodeexporter\" {\n static = 9100\n }\n }\n }\n }\n task \"prod-task2-blackboxexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/blackbox_exporter-0.18.0.linux-amd64/blackbox_exporter\"\n args = [\n \"--config.file=secrets/blackbox.yml\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/blackbox.yml\"\n data = \u003c\u003cEOH\nmodules:\n http_2xx:\n prober: http\n timeout: 5s\n http:\n valid_http_versions: [\"HTTP/1.1\", \"HTTP/2.0\"]\n no_follow_redirects: false\n fail_if_ssl: false\n fail_if_not_ssl: true\n tls_config:\n insecure_skip_verify: false\n preferred_ip_protocol: \"ip4\"\n icmp_v4:\n prober: icmp\n timeout: 5s\n icmp:\n preferred_ip_protocol: \"ip4\"\n dns_udp:\n prober: dns\n timeout: 5s\n dns:\n query_name: \"jenkins.fd.io\"\n query_type: \"A\"\n valid_rcodes:\n - NOERROR\nEOH\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. 
Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-amd64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"blackboxexporter\"\n port = \"blackboxexporter\"\n tags = [ \"blackboxexporter${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Blackbox Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"blackboxexporter\" {\n static = 9115\n }\n }\n }\n }\n\n task \"prod-task3-cadvisorexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"gcr.io/cadvisor/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"cadvisorexporter\"\n port = \"cadvisorexporter\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. 
Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"cadvisorexporter\" {\n static = 8080\n }\n }\n }\n }\n }\n\n group \"prod-group1-exporter-arm64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"==\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-nodeexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"raw_exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/node_exporter-1.0.1.linux-arm64/node_exporter\"\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-arm64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"nodeexporter\"\n port = \"nodeexporter\"\n check {\n name = \"Node Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"nodeexporter\" {\n static = 9100\n }\n }\n }\n }\n\n task \"prod-task2-blackboxexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. 
The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/blackbox_exporter-0.18.0.linux-arm64/blackbox_exporter\"\n args = [\n \"--config.file=secrets/blackbox.yml\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/blackbox.yml\"\n data = \u003c\u003cEOH\nmodules:\n http_2xx:\n prober: http\n timeout: 5s\n http:\n valid_http_versions: [\"HTTP/1.1\", \"HTTP/2.0\"]\n no_follow_redirects: false\n fail_if_ssl: false\n fail_if_not_ssl: true\n tls_config:\n insecure_skip_verify: false\n preferred_ip_protocol: \"ip4\"\n icmp_v4:\n prober: icmp\n timeout: 5s\n icmp:\n preferred_ip_protocol: \"ip4\"\n dns_udp:\n prober: dns\n timeout: 5s\n dns:\n query_name: \"jenkins.fd.io\"\n query_type: \"A\"\n valid_rcodes:\n - NOERROR\nEOH\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-arm64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"blackboxexporter\"\n port = \"blackboxexporter\"\n tags = [ \"blackboxexporter${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Blackbox Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"blackboxexporter\" {\n static = 9115\n }\n }\n }\n }\n\n task \"prod-task3-cadvisorexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. 
The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n # There is currently no official release for arm yet...using community.\n image = \"zcube/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"cadvisorexporter\"\n port = \"cadvisorexporter\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"cadvisorexporter\" {\n static = 8080\n }\n }\n }\n }\n }\n}",
+ "json": null,
+ "modify_index": "7147927",
+ "name": "prod-exporter",
+ "namespace": "default",
+ "policy_override": null,
+ "purge_on_destroy": null,
+ "region": "global",
+ "task_groups": [
+ {
+ "count": 1,
+ "meta": {},
+ "name": "prod-group1-exporter-amd64",
+ "task": [
+ {
+ "driver": "raw_exec",
+ "meta": {},
+ "name": "prod-task1-nodeexporter-amd64",
+ "volume_mounts": []
+ },
+ {
+ "driver": "exec",
+ "meta": {},
+ "name": "prod-task2-blackboxexporter-amd64",
+ "volume_mounts": []
+ },
+ {
+ "driver": "docker",
+ "meta": {},
+ "name": "prod-task3-cadvisorexporter-amd64",
+ "volume_mounts": []
+ }
+ ],
+ "volumes": []
+ },
+ {
+ "count": 1,
+ "meta": {},
+ "name": "prod-group1-exporter-arm64",
+ "task": [
+ {
+ "driver": "raw_exec",
+ "meta": {},
+ "name": "prod-task1-nodeexporter-arm64",
+ "volume_mounts": []
+ },
+ {
+ "driver": "exec",
+ "meta": {},
+ "name": "prod-task2-blackboxexporter-arm64",
+ "volume_mounts": []
+ },
+ {
+ "driver": "docker",
+ "meta": {},
+ "name": "prod-task3-cadvisorexporter-arm64",
+ "volume_mounts": []
+ }
+ ],
+ "volumes": []
+ }
+ ],
+ "type": "system"
+ },
+ "sensitive_attributes": [],
+ "private": "bnVsbA==",
+ "dependencies": [
+ "module.exporter.data.template_file.nomad_job_exporter"
+ ]
+ }
+ ]
+ },
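For orientation, every exporter task recorded in the jobspec above follows one repeating pattern: a system job so each eligible client runs exactly one allocation, a constraint on ${attr.cpu.arch} splitting the amd64 and arm64 groups, an artifact stanza fetching the exporter binary via go-getter, and a Consul service with an HTTP check on a static metrics port. A minimal sketch of that pattern, cut down to a single task (job and task names are illustrative, not taken from the state above):

job "example-exporter" {
  datacenters = ["yul1"]
  # "system" schedules one allocation on every eligible client,
  # which is what a per-host metrics exporter wants.
  type = "system"

  group "exporter-amd64" {
    # Restrict this group to amd64 clients; an arm64 twin would
    # use value = "arm64" and an arm64 artifact URL.
    constraint {
      attribute = "${attr.cpu.arch}"
      operator  = "=="
      value     = "amd64"
    }

    task "node-exporter" {
      driver = "raw_exec"

      # Fetch and unpack the release tarball; the binary lands
      # under the allocation's local/ directory.
      artifact {
        source = "https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-amd64.tar.gz"
      }

      config {
        command = "local/node_exporter-1.0.1.linux-amd64/node_exporter"
      }

      # Register in Consul with a liveness check on /metrics.
      service {
        name = "nodeexporter"
        port = "nodeexporter"
        check {
          type     = "http"
          path     = "/metrics"
          interval = "10s"
          timeout  = "2s"
        }
      }

      resources {
        cpu = 500
        network {
          port "nodeexporter" {
            static = 9100
          }
        }
      }
    }
  }
}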
+ {
+ "module": "module.grafana",
+ "mode": "data",
+ "type": "template_file",
+ "name": "nomad_job_grafana",
+ "provider": "provider[\"registry.terraform.io/hashicorp/template\"]",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "filename": null,
+ "id": "039453e436ebcc20c398c58e986fd3cbc8a797cf28b1006ce0407eba9bb0f9cf",
+ "rendered": "job \"prod-grafana\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-grafana\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-grafana\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"grafana/grafana:7.3.7\"\n dns_servers = [ \"${attr.unique.network.ip-address}\" ]\n volumes = [\n \"secrets/prometheus.yml:/etc/grafana/provisioning/datasources/prometheus.yml\",\n \"secrets/dashboards.yml:/etc/grafana/provisioning/dashboards/dashboards.yml\",\n \"secrets/grafana.ini:/etc/grafana/grafana.ini\",\n \"secrets/node_exporter.json:/etc/grafana/provisioning/dashboards/node_exporter.json\",\n \"secrets/docker_cadvisor.json:/etc/grafana/provisioning/dashboards/docker_cadvisor.json\",\n \"secrets/nomad.json:/etc/grafana/provisioning/dashboards/nomad.json\",\n \"secrets/consul.json:/etc/grafana/provisioning/dashboards/consul.json\",\n \"secrets/prometheus.json:/etc/grafana/provisioning/dashboards/prometheus.json\"\n ]\n }\n\n artifact {\n # Node Exporter\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/node_exporter.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Docker cAdvisor\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/docker_cadvisor.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Nomad\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/nomad.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Consul\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/consul.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/prometheus.json\"\n destination = \"secrets/\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. 
This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\napiVersion: 1\ndatasources:\n- name: Prometheus\n type: prometheus\n access: direct\n orgId: 1\n url: http://prometheus.service.consul:9090\n basicAuth: false\n isDefault: true\n version: 1\n editable: false\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/dashboards.yml\"\n data = \u003c\u003cEOH\napiVersion: 1\nproviders:\n- name: dashboards\n type: file\n disableDeletion: false\n updateIntervalSeconds: 10\n allowUiUpdates: false\n options:\n path: /etc/grafana/provisioning/dashboards\n foldersFromFilesStructure: true\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/grafana.ini\"\n data = \u003c\u003cEOH\napp_mode = production\n\n[metrics]\nenabled = true\n\n[server]\nprotocol = http\nhttp_port = 3000\nroot_url = http://grafana.service.consul:3000\nenable_gzip = true\n;cert_file =\n;cert_key =\n\n[security]\nadmin_user = grafanauser\nadmin_password = Grafana1234\nsecret_key = SW2YcwTIb9zpOOhoPsMm\n\n[users]\nallow_sign_up = false\nallow_org_create = false\nauto_assign_org = true\nauto_assign_org_role = Viewer\ndefault_theme = dark\n\n[auth.basic]\nenabled = true\n\n[auth]\ndisable_login_form = false\ndisable_signout_menu = false\n\n[auth.anonymous]\nenabled = false\n\n[log]\nmode = console\nlevel = info\n\n[log.console]\nlevel = info\nformat = console\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"grafana\"\n port = \"grafana\"\n tags = [ \"grafana${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Grafana Check Live\"\n type = \"http\"\n protocol = \"http\"\n tls_skip_verify = true\n path = \"/api/health\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 2048\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"grafana\" {\n static = 3000\n }\n }\n }\n }\n }\n}",
+ "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"${image}\"\n dns_servers = [ \"$${attr.unique.network.ip-address}\" ]\n volumes = [\n \"secrets/prometheus.yml:/etc/grafana/provisioning/datasources/prometheus.yml\",\n \"secrets/dashboards.yml:/etc/grafana/provisioning/dashboards/dashboards.yml\",\n \"secrets/grafana.ini:/etc/grafana/grafana.ini\",\n \"secrets/node_exporter.json:/etc/grafana/provisioning/dashboards/node_exporter.json\",\n \"secrets/docker_cadvisor.json:/etc/grafana/provisioning/dashboards/docker_cadvisor.json\",\n \"secrets/nomad.json:/etc/grafana/provisioning/dashboards/nomad.json\",\n \"secrets/consul.json:/etc/grafana/provisioning/dashboards/consul.json\",\n \"secrets/prometheus.json:/etc/grafana/provisioning/dashboards/prometheus.json\"\n ]\n }\n\n artifact {\n # Node Exporter\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/node_exporter.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Docker cAdvisor\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/docker_cadvisor.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Nomad\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/nomad.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Consul\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/consul.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/prometheus.json\"\n destination = \"secrets/\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. 
This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\napiVersion: 1\ndatasources:\n- name: Prometheus\n type: prometheus\n access: direct\n orgId: 1\n url: http://prometheus.service.consul:9090\n basicAuth: false\n isDefault: true\n version: 1\n editable: false\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/dashboards.yml\"\n data = \u003c\u003cEOH\napiVersion: 1\nproviders:\n- name: dashboards\n type: file\n disableDeletion: false\n updateIntervalSeconds: 10\n allowUiUpdates: false\n options:\n path: /etc/grafana/provisioning/dashboards\n foldersFromFilesStructure: true\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/grafana.ini\"\n data = \u003c\u003cEOH\napp_mode = production\n\n[metrics]\nenabled = true\n\n[server]\nprotocol = http\nhttp_port = ${port}\nroot_url = http://${service_name}.service.consul:${port}\nenable_gzip = true\n;cert_file =\n;cert_key =\n\n[security]\nadmin_user = grafanauser\nadmin_password = Grafana1234\nsecret_key = SW2YcwTIb9zpOOhoPsMm\n\n[users]\nallow_sign_up = false\nallow_org_create = false\nauto_assign_org = true\nauto_assign_org_role = Viewer\ndefault_theme = dark\n\n[auth.basic]\nenabled = true\n\n[auth]\ndisable_login_form = false\ndisable_signout_menu = false\n\n[auth.anonymous]\nenabled = false\n\n[log]\nmode = console\nlevel = info\n\n[log.console]\nlevel = info\nformat = console\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Grafana Check Live\"\n type = \"http\"\n protocol = \"http\"\n tls_skip_verify = true\n path = \"/api/health\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. 
Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}",
+ "vars": {
+ "cpu": "2000",
+ "datacenters": "yul1",
+ "group_count": "1",
+ "image": "grafana/grafana:7.3.7",
+ "job_name": "prod-grafana",
+ "mem": "2048",
+ "port": "3000",
+ "service_name": "grafana",
+ "use_canary": "true",
+ "use_vault_provider": "false"
+ }
+ },
+ "sensitive_attributes": []
+ }
+ ]
+ },
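Note how the "template" attribute above differs from "rendered": it carries two interpolation layers. ${...} is expanded by Terraform's template provider from the vars map, $${...} escapes to a literal ${...} that is left for Nomad to resolve at runtime (e.g. "$${attr.cpu.arch}" and "$${NOMAD_ALLOC_INDEX}" above), and %{ if ... } / %{ endif } are Terraform template directives. A minimal sketch of that wiring, with a hypothetical template file name:

data "template_file" "nomad_job" {
  # grafana.hcl.tpl would contain lines such as:
  #   tags = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ]
  # Terraform substitutes ${service_name} from vars below; the
  # escaped $${NOMAD_ALLOC_INDEX} survives as ${NOMAD_ALLOC_INDEX}
  # for Nomad to fill in per allocation.
  template = file("${path.module}/grafana.hcl.tpl")

  vars = {
    service_name = "grafana"
    port         = "3000"
    use_canary   = "true"
  }
}

resource "nomad_job" "grafana" {
  # Submit the fully rendered jobspec, as the managed
  # nomad_job resources in this state do.
  jobspec = data.template_file.nomad_job.rendered
}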
+ {
+ "module": "module.grafana",
+ "mode": "managed",
+ "type": "nomad_job",
+ "name": "nomad_job_grafana",
+ "provider": "provider[\"registry.terraform.io/hashicorp/nomad\"].yul1",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "allocation_ids": [
+ "d2ddca98-1bb8-680d-06f1-00fcf4887eca"
+ ],
+ "datacenters": [
+ "yul1"
+ ],
+ "deployment_id": "4ef2df3e-696c-5382-5eec-991411a36a90",
+ "deployment_status": "successful",
+ "deregister_on_destroy": true,
+ "deregister_on_id_change": true,
+ "detach": false,
+ "id": "prod-grafana",
+ "jobspec": "job \"prod-grafana\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-grafana\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-grafana\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"grafana/grafana:7.3.7\"\n dns_servers = [ \"${attr.unique.network.ip-address}\" ]\n volumes = [\n \"secrets/prometheus.yml:/etc/grafana/provisioning/datasources/prometheus.yml\",\n \"secrets/dashboards.yml:/etc/grafana/provisioning/dashboards/dashboards.yml\",\n \"secrets/grafana.ini:/etc/grafana/grafana.ini\",\n \"secrets/node_exporter.json:/etc/grafana/provisioning/dashboards/node_exporter.json\",\n \"secrets/docker_cadvisor.json:/etc/grafana/provisioning/dashboards/docker_cadvisor.json\",\n \"secrets/nomad.json:/etc/grafana/provisioning/dashboards/nomad.json\",\n \"secrets/consul.json:/etc/grafana/provisioning/dashboards/consul.json\",\n \"secrets/prometheus.json:/etc/grafana/provisioning/dashboards/prometheus.json\"\n ]\n }\n\n artifact {\n # Prometheus Node Exporter Full\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/node_exporter.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Docker cAdvisor\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/docker_cadvisor.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Nomad\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/nomad.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Consul\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/consul.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/prometheus.json\"\n destination = \"secrets/\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. 
This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\napiVersion: 1\ndatasources:\n- name: Prometheus\n type: prometheus\n access: direct\n orgId: 1\n url: http://prometheus.service.consul:9090\n basicAuth: false\n isDefault: true\n version: 1\n editable: false\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/dashboards.yml\"\n data = \u003c\u003cEOH\napiVersion: 1\nproviders:\n- name: dashboards\n type: file\n disableDeletion: false\n updateIntervalSeconds: 10\n allowUiUpdates: false\n options:\n path: /etc/grafana/provisioning/dashboards\n foldersFromFilesStructure: true\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/grafana.ini\"\n data = \u003c\u003cEOH\napp_mode = production\n\n[metrics]\nenabled = true\n\n[server]\nprotocol = http\nhttp_port = 3000\nroot_url = http://grafana.service.consul:3000\nenable_gzip = true\n;cert_file =\n;cert_key =\n\n[security]\nadmin_user = grafanauser\nadmin_password = Grafana1234\nsecret_key = SW2YcwTIb9zpOOhoPsMm\n\n[users]\nallow_sign_up = false\nallow_org_create = false\nauto_assign_org = true\nauto_assign_org_role = Viewer\ndefault_theme = dark\n\n[auth.basic]\nenabled = true\n\n[auth]\ndisable_login_form = false\ndisable_signout_menu = false\n\n[auth.anonymous]\nenabled = false\n\n[log]\nmode = console\nlevel = info\n\n[log.console]\nlevel = info\nformat = console\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"grafana\"\n port = \"grafana\"\n tags = [ \"grafana${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Grafana Check Live\"\n type = \"http\"\n protocol = \"http\"\n tls_skip_verify = true\n path = \"/api/health\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 2048\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"grafana\" {\n static = 3000\n }\n }\n }\n }\n }\n}",
+ "json": null,
+ "modify_index": "7140252",
+ "name": "prod-grafana",
+ "namespace": "default",
+ "policy_override": null,
+ "purge_on_destroy": null,
+ "region": "global",
+ "task_groups": [
+ {
+ "count": 1,
+ "meta": {},
+ "name": "prod-group1-grafana",
+ "task": [
+ {
+ "driver": "docker",
+ "meta": {},
+ "name": "prod-task1-grafana",
+ "volume_mounts": []
+ }
+ ],
+ "volumes": []
+ }
+ ],
+ "type": "service"
+ },
+ "sensitive_attributes": [],
+ "private": "bnVsbA==",
+ "dependencies": [
+ "module.grafana.data.template_file.nomad_job_grafana"
+ ]
+ }
+ ]
+ },
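The update stanza driving the grafana deployment above (deployment_status "successful") is Nomad's canary mechanism: a canary allocation starts next to the running version, auto_promote replaces the old allocations only once the canary's Consul checks pass, and auto_revert falls back to the last stable job version if the deployment fails. The stanza in isolation, with an assumed group count of 2 to show the blue/green variant:

update {
  max_parallel     = 1
  health_check     = "checks"
  min_healthy_time = "10s"
  healthy_deadline = "3m"

  # With canary equal to the group count (2 here), an update
  # deploys a full new set alongside the old one (blue/green);
  # with canary = 1 it is a conventional canary rollout.
  canary = 2

  # Promote automatically once every canary passes its checks;
  # otherwise "nomad deployment promote <id>" is required.
  auto_promote = true

  # Roll back to the last job version whose deployment was
  # fully healthy if this deployment fails.
  auto_revert = true
}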
+ {
+ "module": "module.minio",
+ "mode": "data",
+ "type": "template_file",
+ "name": "nomad_job_mc",
+ "provider": "provider[\"registry.terraform.io/hashicorp/template\"]",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "filename": null,
+ "id": "ca0c95bbe91c4ac393d5cd5bef8efb90fb5f176f266ba233542fd4338b51c6cc",
+ "rendered": "job \"prod-mc\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"batch\"\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-mc\" {\n task \"prod-task1-create-buckets\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"minio/mc:RELEASE.2020-12-10T01-26-17Z\"\n entrypoint = [\n \"/bin/sh\",\n \"-c\",\n \"mc config host add LOCALMINIO http://storage.service.consul:9000 $MINIO_ACCESS_KEY $MINIO_SECRET_KEY \u0026\u0026 mc mb -p LOCALMINIO/logs.fd.io LOCALMINIO/docs.fd.io ; mc policy set public LOCALMINIO/logs.fd.io mc policy set public LOCALMINIO/docs.fd.io mc ilm add --expiry-days '180' LOCALMINIO/logs.fd.io mc admin user add LOCALMINIO storage Storage1234 mc admin policy set LOCALMINIO writeonly user=storage\"\n ]\n dns_servers = [ \"${attr.unique.network.ip-address}\" ]\n privileged = false\n }\n\n # The env stanza configures a list of environment variables to populate\n # the task's environment before starting.\n env {\n \n MINIO_ACCESS_KEY = \"minio\"\n MINIO_SECRET_KEY = \"minio123\"\n \n \n }\n }\n }\n}\n",
+ "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"batch\"\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-mc\" {\n task \"prod-task1-create-buckets\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"${image}\"\n entrypoint = [\n \"/bin/sh\",\n \"-c\",\n \"${command}\"\n ]\n dns_servers = [ \"$${attr.unique.network.ip-address}\" ]\n privileged = false\n }\n\n # The env stanza configures a list of environment variables to populate\n # the task's environment before starting.\n env {\n %{ if use_vault_provider }\n {{ with secret \"${vault_kv_path}\" }}\n MINIO_ACCESS_KEY = \"{{ .Data.data.${vault_kv_field_access_key} }}\"\n MINIO_SECRET_KEY = \"{{ .Data.data.${vault_kv_field_secret_key} }}\"\n {{ end }}\n %{ else }\n MINIO_ACCESS_KEY = \"${access_key}\"\n MINIO_SECRET_KEY = \"${secret_key}\"\n %{ endif }\n ${ envs }\n }\n }\n }\n}\n",
+ "vars": {
+ "access_key": "minio",
+ "command": "mc config host add LOCALMINIO http://storage.service.consul:9000 $MINIO_ACCESS_KEY $MINIO_SECRET_KEY \u0026\u0026 mc mb -p LOCALMINIO/logs.fd.io LOCALMINIO/docs.fd.io ; mc policy set public LOCALMINIO/logs.fd.io mc policy set public LOCALMINIO/docs.fd.io mc ilm add --expiry-days '180' LOCALMINIO/logs.fd.io mc admin user add LOCALMINIO storage Storage1234 mc admin policy set LOCALMINIO writeonly user=storage",
+ "datacenters": "yul1",
+ "envs": "",
+ "image": "minio/mc:RELEASE.2020-12-10T01-26-17Z",
+ "job_name": "prod-mc",
+ "minio_port": "9000",
+ "minio_service_name": "storage",
+ "secret_key": "minio123",
+ "service_name": "mc",
+ "use_vault_provider": "false"
+ }
+ },
+ "sensitive_attributes": []
+ }
+ ]
+ },
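The mc template above branches on use_vault_provider, which is "false" in these vars; that is why the rendered env stanza carries the literal minio/minio123 credentials instead of a Vault lookup. A sketch of one way the enabled branch could be realized, using Nomad's template stanza with env = true rather than the inline env interpolation in the template above (the policy name and KV path are illustrative assumptions):

task "create-buckets" {
  driver = "docker"

  # A Vault token scoped to this policy is injected into the task.
  vault {
    policies = ["prod-kv-policy"]
  }

  config {
    image = "minio/mc:RELEASE.2020-12-10T01-26-17Z"
    # entrypoint elided; same bucket-creation command as above.
  }

  # Rendered by consul-template against Vault; env = true exports
  # the resulting keys as environment variables for the task.
  template {
    destination = "secrets/minio.env"
    env         = true
    data        = <<EOH
{{ with secret "kv/data/minio" }}
MINIO_ACCESS_KEY="{{ .Data.data.access_key }}"
MINIO_SECRET_KEY="{{ .Data.data.secret_key }}"
{{ end }}
EOH
  }
}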
+ {
+ "module": "module.minio",
+ "mode": "data",
+ "type": "template_file",
+ "name": "nomad_job_minio",
+ "provider": "provider[\"registry.terraform.io/hashicorp/template\"]",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "filename": null,
+ "id": "c38a0b7182d39c70ec07bdaa998f57a215d7193b264c1b1c1a299c84e7ca53de",
+ "rendered": "job \"prod-minio\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # All groups in this job should be scheduled on different hosts.\n constraint {\n operator = \"distinct_hosts\"\n value = \"true\"\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-minio\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # https://www.nomadproject.io/docs/job-specification/volume\n \n volume \"prod-volume1-minio\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task.html\n #\n task \"prod-task1-minio\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n \n volume_mount {\n volume = \"prod-volume1-minio\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"minio/minio:RELEASE.2020-12-03T05-49-24Z\"\n dns_servers = [ \"${attr.unique.network.ip-address}\" ]\n network_mode = \"host\"\n command = \"server\"\n args = [ \"http://10.32.8.1{4...7}:9000/data/\" ]\n port_map {\n http = 9000\n }\n privileged = false\n }\n\n # The env stanza configures a list of environment variables to populate\n # the task's environment before starting.\n env {\n\n MINIO_ACCESS_KEY = \"minio\"\n MINIO_SECRET_KEY = \"minio123\"\n\n MINIO_BROWSER=\"off\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service.html\n #\n service {\n name = \"storage\"\n port = \"http\"\n tags = [ \"storage${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Min.io Server HTTP Check Live\"\n type = \"http\"\n port = \"http\"\n protocol = \"http\"\n method = \"GET\"\n path = \"/minio/health/live\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n check {\n name = \"Min.io Server HTTP Check Ready\"\n type = \"http\"\n port = \"http\"\n protocol = \"http\"\n method = \"GET\"\n path = \"/minio/health/ready\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources.html\n #\n resources {\n cpu = 40000\n memory = 40000\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. 
Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network.html\n #\n network {\n port \"http\" {\n static = 9000\n }\n }\n }\n }\n }\n}\n",
+ "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # All groups in this job should be scheduled on different hosts.\n constraint {\n operator = \"distinct_hosts\"\n value = \"true\"\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-minio\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n # https://www.nomadproject.io/docs/job-specification/volume\n %{ if use_host_volume }\n volume \"prod-volume1-minio\" {\n type = \"host\"\n read_only = false\n source = \"${host_volume}\"\n }\n %{ endif }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task.html\n #\n task \"prod-task1-minio\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n %{ if use_host_volume }\n volume_mount {\n volume = \"prod-volume1-minio\"\n destination = \"${data_dir}\"\n read_only = false\n }\n %{ endif }\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"${image}\"\n dns_servers = [ \"$${attr.unique.network.ip-address}\" ]\n network_mode = \"host\"\n command = \"server\"\n args = [ \"${host}:${port}${data_dir}\" ]\n port_map {\n http = ${port}\n }\n privileged = false\n }\n\n # The env stanza configures a list of environment variables to populate\n # the task's environment before starting.\n env {\n%{ if use_vault_provider }\n{{ with secret \"${vault_kv_path}\" }}\n MINIO_ACCESS_KEY = \"{{ .Data.data.${vault_kv_field_access_key} }}\"\n MINIO_SECRET_KEY = \"{{ .Data.data.${vault_kv_field_secret_key} }}\"\n{{ end }}\n%{ else }\n MINIO_ACCESS_KEY = \"${access_key}\"\n MINIO_SECRET_KEY = \"${secret_key}\"\n%{ endif }\n ${ envs }\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service.html\n #\n service {\n name = \"${service_name}\"\n port = \"http\"\n tags = [ \"storage$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Min.io Server HTTP Check Live\"\n type = \"http\"\n port = \"http\"\n protocol = \"http\"\n method = \"GET\"\n path = \"/minio/health/live\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n check {\n name = \"Min.io Server HTTP Check Ready\"\n type = \"http\"\n port = \"http\"\n protocol = \"http\"\n method = \"GET\"\n path = \"/minio/health/ready\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. 
Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources.html\n #\n resources {\n cpu = ${cpu}\n memory = ${memory}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network.html\n #\n network {\n port \"http\" {\n static = ${port}\n }\n }\n }\n }\n }\n}\n",
+ "vars": {
+ "access_key": "minio",
+ "cpu": "40000",
+ "cpu_proxy": "200",
+ "data_dir": "/data/",
+ "datacenters": "yul1",
+ "envs": "MINIO_BROWSER=\"off\"",
+ "group_count": "4",
+ "host": "http://10.32.8.1{4...7}",
+ "host_volume": "prod-volume-data1-1",
+ "image": "minio/minio:RELEASE.2020-12-03T05-49-24Z",
+ "job_name": "prod-minio",
+ "memory": "40000",
+ "memory_proxy": "128",
+ "port": "9000",
+ "secret_key": "minio123",
+ "service_name": "storage",
+ "upstreams": "[]",
+ "use_canary": "true",
+ "use_host_volume": "true",
+ "use_vault_provider": "false"
+ }
+ },
+ "sensitive_attributes": []
+ }
+ ]
+ },
+ {
+ "module": "module.minio",
+ "mode": "managed",
+ "type": "nomad_job",
+ "name": "nomad_job_mc",
+ "provider": "provider[\"registry.terraform.io/hashicorp/nomad\"].yul1",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "allocation_ids": [
+ "5b129de9-a002-a1e5-7676-1f6623a6f4bd"
+ ],
+ "datacenters": [
+ "yul1"
+ ],
+ "deployment_id": "",
+ "deployment_status": "",
+ "deregister_on_destroy": true,
+ "deregister_on_id_change": true,
+ "detach": false,
+ "id": "prod-mc",
+ "jobspec": "job \"prod-mc\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"batch\"\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-mc\" {\n task \"prod-task1-create-buckets\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"minio/mc:RELEASE.2020-12-10T01-26-17Z\"\n entrypoint = [\n \"/bin/sh\",\n \"-c\",\n \"mc config host add LOCALMINIO http://storage.service.consul:9000 $MINIO_ACCESS_KEY $MINIO_SECRET_KEY \u0026\u0026 mc mb -p LOCALMINIO/logs.fd.io LOCALMINIO/docs.fd.io ; mc policy set public LOCALMINIO/logs.fd.io mc policy set public LOCALMINIO/docs.fd.io mc ilm add --expiry-days '180' LOCALMINIO/logs.fd.io mc admin user add LOCALMINIO storage Storage1234 mc admin policy set LOCALMINIO writeonly user=storage\"\n ]\n dns_servers = [ \"${attr.unique.network.ip-address}\" ]\n privileged = false\n }\n\n # The env stanza configures a list of environment variables to populate\n # the task's environment before starting.\n env {\n \n MINIO_ACCESS_KEY = \"minio\"\n MINIO_SECRET_KEY = \"minio123\"\n \n \n }\n }\n }\n}\n",
+ "json": null,
+ "modify_index": "7147950",
+ "name": "prod-mc",
+ "namespace": "default",
+ "policy_override": null,
+ "purge_on_destroy": null,
+ "region": "global",
+ "task_groups": [
+ {
+ "count": 1,
+ "meta": {},
+ "name": "prod-group1-mc",
+ "task": [
+ {
+ "driver": "docker",
+ "meta": {},
+ "name": "prod-task1-create-buckets",
+ "volume_mounts": null
+ }
+ ],
+ "volumes": null
+ }
+ ],
+ "type": "batch"
+ },
+ "sensitive_attributes": [],
+ "private": "bnVsbA==",
+ "dependencies": [
+ "module.minio.data.template_file.nomad_job_mc",
+ "module.minio.data.template_file.nomad_job_minio",
+ "module.minio.nomad_job.nomad_job_minio"
+ ]
+ }
+ ]
+ },
+ {
+ "module": "module.minio",
+ "mode": "managed",
+ "type": "nomad_job",
+ "name": "nomad_job_minio",
+ "provider": "provider[\"registry.terraform.io/hashicorp/nomad\"].yul1",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "allocation_ids": [
+ "322e6ea5-acc7-2f37-3022-cd0a31c2f3db",
+ "619833b9-dd70-46ae-f576-75c3cd058a13",
+ "0510561a-c828-dd23-5910-b92d9f2c41ef",
+ "504924c8-387b-abe2-c4d5-d6a7424a4689"
+ ],
+ "datacenters": [
+ "yul1"
+ ],
+ "deployment_id": "cd6d79cf-6e6f-289d-a0bb-6b2bb8da769a",
+ "deployment_status": "successful",
+ "deregister_on_destroy": true,
+ "deregister_on_id_change": true,
+ "detach": false,
+ "id": "prod-minio",
+ "jobspec": "job \"prod-minio\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # All groups in this job should be scheduled on different hosts.\n constraint {\n operator = \"distinct_hosts\"\n value = \"true\"\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-minio\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # https://www.nomadproject.io/docs/job-specification/volume\n \n volume \"prod-volume1-minio\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task.html\n #\n task \"prod-task1-minio\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n \n volume_mount {\n volume = \"prod-volume1-minio\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"minio/minio:RELEASE.2020-12-03T05-49-24Z\"\n dns_servers = [ \"${attr.unique.network.ip-address}\" ]\n network_mode = \"host\"\n command = \"server\"\n args = [ \"http://10.32.8.1{4...7}:9000/data/\" ]\n port_map {\n http = 9000\n }\n privileged = false\n }\n\n # The env stanza configures a list of environment variables to populate\n # the task's environment before starting.\n env {\n\n MINIO_ACCESS_KEY = \"minio\"\n MINIO_SECRET_KEY = \"minio123\"\n\n MINIO_BROWSER=\"off\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service.html\n #\n service {\n name = \"storage\"\n port = \"http\"\n tags = [ \"storage${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Min.io Server HTTP Check Live\"\n type = \"http\"\n port = \"http\"\n protocol = \"http\"\n method = \"GET\"\n path = \"/minio/health/live\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n check {\n name = \"Min.io Server HTTP Check Ready\"\n type = \"http\"\n port = \"http\"\n protocol = \"http\"\n method = \"GET\"\n path = \"/minio/health/ready\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources.html\n #\n resources {\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. 
Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network.html\n #\n cpu = 40000\n memory = 40000\n network {\n port \"http\" {\n static = 9000\n }\n }\n }\n }\n }\n}\n",
+ "json": null,
+ "modify_index": "5933603",
+ "name": "prod-minio",
+ "namespace": "default",
+ "policy_override": null,
+ "purge_on_destroy": null,
+ "region": "global",
+ "task_groups": [
+ {
+ "count": 4,
+ "meta": {},
+ "name": "prod-group1-minio",
+ "task": [
+ {
+ "driver": "docker",
+ "meta": {},
+ "name": "prod-task1-minio",
+ "volume_mounts": [
+ {
+ "destination": "/data/",
+ "read_only": false,
+ "volume": "prod-volume1-minio"
+ }
+ ]
+ }
+ ],
+ "volumes": [
+ {
+ "name": "prod-volume1-minio",
+ "read_only": false,
+ "source": "prod-volume-data1-1",
+ "type": "host"
+ }
+ ]
+ }
+ ],
+ "type": "service"
+ },
+ "sensitive_attributes": [],
+ "private": "bnVsbA==",
+ "dependencies": [
+ "module.minio.data.template_file.nomad_job_minio"
+ ]
+ }
+ ]
+ },
+ {
+ "module": "module.nginx",
+ "mode": "data",
+ "type": "template_file",
+ "name": "nomad_job_nginx",
+ "provider": "provider[\"registry.terraform.io/hashicorp/template\"]",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "filename": null,
+ "id": "2816603491214d9abe10981b9f9cc6a4cb806acbc6981b952653847fec86e9ff",
+ "rendered": "job \"prod-nginx\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 0\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = false\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 0\n }\n\n # The reschedule stanza specifies the group's rescheduling strategy. If\n # specified at the job level, the configuration will apply to all groups\n # within the job. If the reschedule stanza is present on both the job and the\n # group, they are merged with the group stanza taking the highest precedence\n # and then the job.\n reschedule {\n delay = \"30s\"\n delay_function = \"constant\"\n unlimited = true\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-nginx\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The restart stanza configures a tasks's behavior on task failure. Restarts\n # happen on the client that is running the task.\n restart {\n interval = \"10m\"\n attempts = 2\n delay = \"15s\"\n mode = \"fail\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task.html\n #\n task \"prod-task1-nginx\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"nginx:stable\"\n dns_servers = [ \"${attr.unique.network.ip-address}\" ]\n port_map {\n https = 443\n }\n privileged = false\n volumes = [\n \"/etc/consul.d/ssl/consul.pem:/etc/ssl/certs/nginx-cert.pem\",\n \"/etc/consul.d/ssl/consul-key.pem:/etc/ssl/private/nginx-key.pem\",\n \"custom/upstream.conf:/etc/nginx/conf.d/upstream.conf\",\n \"custom/logs.conf:/etc/nginx/conf.d/logs.conf\",\n \"custom/docs.conf:/etc/nginx/conf.d/docs.conf\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. 
This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template.html\n #\n template {\n data = \u003c\u003cEOH\n upstream storage {\n server storage0.storage.service.consul:9000;\n server storage1.storage.service.consul:9000;\n server storage2.storage.service.consul:9000;\n server storage3.storage.service.consul:9000;\n }\n EOH\n destination = \"custom/upstream.conf\"\n }\n template {\n data = \u003c\u003cEOH\n server {\n listen 443 ssl default_server;\n server_name logs.nginx.service.consul;\n keepalive_timeout 70;\n ssl_session_cache shared:SSL:10m;\n ssl_session_timeout 10m;\n ssl_protocols TLSv1.2;\n ssl_prefer_server_ciphers on;\n ssl_ciphers \"ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES256-SHA384:ECDHE-RSA-AES256-SHA384\";\n ssl_certificate /etc/ssl/certs/nginx-cert.pem;\n ssl_certificate_key /etc/ssl/private/nginx-key.pem;\n location / {\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/logs.fd.io/;\n server_name_in_redirect off;\n }\n location ~ (.*html.gz)$ {\n add_header Content-Encoding gzip;\n add_header Content-Type text/html;\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/logs.fd.io/$1;\n server_name_in_redirect off;\n }\n location ~ (.*txt.gz|.*log.gz)$ {\n add_header Content-Encoding gzip;\n add_header Content-Type text/plain;\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/logs.fd.io/$1;\n server_name_in_redirect off;\n }\n location ~ (.*xml.gz)$ {\n add_header Content-Encoding gzip;\n add_header Content-Type application/xml;\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/logs.fd.io/$1;\n server_name_in_redirect off;\n }\n }\n EOH\n destination = \"custom/logs.conf\"\n }\n template {\n data = \u003c\u003cEOH\n server {\n listen 443 ssl;\n server_name docs.nginx.service.consul;\n keepalive_timeout 70;\n ssl_session_cache shared:SSL:10m;\n ssl_session_timeout 10m;\n ssl_protocols TLSv1.2;\n ssl_prefer_server_ciphers on;\n ssl_ciphers \"ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES256-SHA384:ECDHE-RSA-AES256-SHA384\";\n ssl_certificate /etc/ssl/certs/nginx-cert.pem;\n ssl_certificate_key /etc/ssl/private/nginx-key.pem;\n location / {\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/docs.fd.io/;\n server_name_in_redirect off;\n }\n }\n EOH\n destination = \"custom/docs.conf\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n 
#\n # https://www.nomadproject.io/docs/job-specification/service.html\n #\n service {\n name = \"nginx\"\n port = \"https\"\n tags = [ \"docs\", \"logs\" ]\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources.html\n #\n resources {\n cpu = 1000\n memory = 1024\n network {\n mode = \"bridge\"\n port \"https\" {\n static = 443\n }\n }\n }\n }\n }\n}",
+ "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 0\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = false\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 0\n }\n\n # The reschedule stanza specifies the group's rescheduling strategy. If\n # specified at the job level, the configuration will apply to all groups\n # within the job. If the reschedule stanza is present on both the job and the\n # group, they are merged with the group stanza taking the highest precedence\n # and then the job.\n reschedule {\n delay = \"30s\"\n delay_function = \"constant\"\n unlimited = true\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-nginx\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The restart stanza configures a tasks's behavior on task failure. Restarts\n # happen on the client that is running the task.\n restart {\n interval = \"10m\"\n attempts = 2\n delay = \"15s\"\n mode = \"fail\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task.html\n #\n task \"prod-task1-nginx\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"nginx:stable\"\n dns_servers = [ \"$${attr.unique.network.ip-address}\" ]\n port_map {\n https = 443\n }\n privileged = false\n volumes = [\n \"/etc/consul.d/ssl/consul.pem:/etc/ssl/certs/nginx-cert.pem\",\n \"/etc/consul.d/ssl/consul-key.pem:/etc/ssl/private/nginx-key.pem\",\n \"custom/upstream.conf:/etc/nginx/conf.d/upstream.conf\",\n \"custom/logs.conf:/etc/nginx/conf.d/logs.conf\",\n \"custom/docs.conf:/etc/nginx/conf.d/docs.conf\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. 
This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template.html\n #\n template {\n data = \u003c\u003cEOH\n upstream storage {\n server storage0.storage.service.consul:9000;\n server storage1.storage.service.consul:9000;\n server storage2.storage.service.consul:9000;\n server storage3.storage.service.consul:9000;\n }\n EOH\n destination = \"custom/upstream.conf\"\n }\n template {\n data = \u003c\u003cEOH\n server {\n listen 443 ssl default_server;\n server_name logs.nginx.service.consul;\n keepalive_timeout 70;\n ssl_session_cache shared:SSL:10m;\n ssl_session_timeout 10m;\n ssl_protocols TLSv1.2;\n ssl_prefer_server_ciphers on;\n ssl_ciphers \"ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES256-SHA384:ECDHE-RSA-AES256-SHA384\";\n ssl_certificate /etc/ssl/certs/nginx-cert.pem;\n ssl_certificate_key /etc/ssl/private/nginx-key.pem;\n location / {\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/logs.fd.io/;\n server_name_in_redirect off;\n }\n location ~ (.*html.gz)$ {\n add_header Content-Encoding gzip;\n add_header Content-Type text/html;\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/logs.fd.io/$1;\n server_name_in_redirect off;\n }\n location ~ (.*txt.gz|.*log.gz)$ {\n add_header Content-Encoding gzip;\n add_header Content-Type text/plain;\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/logs.fd.io/$1;\n server_name_in_redirect off;\n }\n location ~ (.*xml.gz)$ {\n add_header Content-Encoding gzip;\n add_header Content-Type application/xml;\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/logs.fd.io/$1;\n server_name_in_redirect off;\n }\n }\n EOH\n destination = \"custom/logs.conf\"\n }\n template {\n data = \u003c\u003cEOH\n server {\n listen 443 ssl;\n server_name docs.nginx.service.consul;\n keepalive_timeout 70;\n ssl_session_cache shared:SSL:10m;\n ssl_session_timeout 10m;\n ssl_protocols TLSv1.2;\n ssl_prefer_server_ciphers on;\n ssl_ciphers \"ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES256-SHA384:ECDHE-RSA-AES256-SHA384\";\n ssl_certificate /etc/ssl/certs/nginx-cert.pem;\n ssl_certificate_key /etc/ssl/private/nginx-key.pem;\n location / {\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/docs.fd.io/;\n server_name_in_redirect off;\n }\n }\n EOH\n destination = \"custom/docs.conf\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n 
#\n # https://www.nomadproject.io/docs/job-specification/service.html\n #\n service {\n name = \"nginx\"\n port = \"https\"\n tags = [ \"docs\", \"logs\" ]\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources.html\n #\n resources {\n cpu = 1000\n memory = 1024\n network {\n mode = \"bridge\"\n port \"https\" {\n static = 443\n }\n }\n }\n }\n }\n}",
+ "vars": {
+ "datacenters": "yul1",
+ "job_name": "prod-nginx"
+ }
+ },
+ "sensitive_attributes": []
+ }
+ ]
+ },
+ {
+ "module": "module.nginx",
+ "mode": "managed",
+ "type": "nomad_job",
+ "name": "nomad_job_nginx",
+ "provider": "provider[\"registry.terraform.io/hashicorp/nomad\"].yul1",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "allocation_ids": [
+ "1dea6de0-d5cd-2a1b-fb42-0468e986e0d6"
+ ],
+ "datacenters": [
+ "yul1"
+ ],
+ "deployment_id": "",
+ "deployment_status": "",
+ "deregister_on_destroy": true,
+ "deregister_on_id_change": true,
+ "detach": false,
+ "id": "prod-nginx",
+ "jobspec": "job \"prod-nginx\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 0\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = false\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 0\n }\n\n # The reschedule stanza specifies the group's rescheduling strategy. If\n # specified at the job level, the configuration will apply to all groups\n # within the job. If the reschedule stanza is present on both the job and the\n # group, they are merged with the group stanza taking the highest precedence\n # and then the job.\n reschedule {\n delay = \"30s\"\n delay_function = \"constant\"\n unlimited = true\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-nginx\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The restart stanza configures a tasks's behavior on task failure. Restarts\n # happen on the client that is running the task.\n restart {\n interval = \"10m\"\n attempts = 2\n delay = \"15s\"\n mode = \"fail\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task.html\n #\n task \"prod-task1-nginx\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"nginx:stable\"\n dns_servers = [ \"${attr.unique.network.ip-address}\" ]\n port_map {\n https = 443\n }\n privileged = false\n volumes = [\n \"/etc/consul.d/ssl/consul.pem:/etc/ssl/certs/nginx-cert.pem\",\n \"/etc/consul.d/ssl/consul-key.pem:/etc/ssl/private/nginx-key.pem\",\n \"custom/upstream.conf:/etc/nginx/conf.d/upstream.conf\",\n \"custom/logs.conf:/etc/nginx/conf.d/logs.conf\",\n \"custom/docs.conf:/etc/nginx/conf.d/docs.conf\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. 
This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template.html\n #\n template {\n data = \u003c\u003cEOH\n upstream storage {\n server storage0.storage.service.consul:9000;\n server storage1.storage.service.consul:9000;\n server storage2.storage.service.consul:9000;\n server storage3.storage.service.consul:9000;\n }\n EOH\n destination = \"custom/upstream.conf\"\n }\n template {\n data = \u003c\u003cEOH\n server {\n listen 443 ssl default_server;\n server_name logs.nginx.service.consul;\n keepalive_timeout 70;\n ssl_session_cache shared:SSL:10m;\n ssl_session_timeout 10m;\n ssl_protocols TLSv1.2;\n ssl_prefer_server_ciphers on;\n ssl_ciphers \"ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES256-SHA384:ECDHE-RSA-AES256-SHA384\";\n ssl_certificate /etc/ssl/certs/nginx-cert.pem;\n ssl_certificate_key /etc/ssl/private/nginx-key.pem;\n location / {\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/logs.fd.io/;\n server_name_in_redirect off;\n }\n location ~ (.*html.gz)$ {\n add_header Content-Encoding gzip;\n add_header Content-Type text/html;\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/logs.fd.io/$1;\n server_name_in_redirect off;\n }\n location ~ (.*txt.gz|.*log.gz)$ {\n add_header Content-Encoding gzip;\n add_header Content-Type text/plain;\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/logs.fd.io/$1;\n server_name_in_redirect off;\n }\n location ~ (.*xml.gz)$ {\n add_header Content-Encoding gzip;\n add_header Content-Type application/xml;\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/logs.fd.io/$1;\n server_name_in_redirect off;\n }\n }\n EOH\n destination = \"custom/logs.conf\"\n }\n template {\n data = \u003c\u003cEOH\n server {\n listen 443 ssl;\n server_name docs.nginx.service.consul;\n keepalive_timeout 70;\n ssl_session_cache shared:SSL:10m;\n ssl_session_timeout 10m;\n ssl_protocols TLSv1.2;\n ssl_prefer_server_ciphers on;\n ssl_ciphers \"ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES256-SHA384:ECDHE-RSA-AES256-SHA384\";\n ssl_certificate /etc/ssl/certs/nginx-cert.pem;\n ssl_certificate_key /etc/ssl/private/nginx-key.pem;\n location / {\n chunked_transfer_encoding off;\n proxy_connect_timeout 300;\n proxy_http_version 1.1;\n proxy_set_header Host $host:$server_port;\n proxy_set_header Connection \"\";\n proxy_pass http://storage/docs.fd.io/;\n server_name_in_redirect off;\n }\n }\n EOH\n destination = \"custom/docs.conf\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n 
#\n # https://www.nomadproject.io/docs/job-specification/service.html\n #\n service {\n name = \"nginx\"\n port = \"https\"\n tags = [ \"docs\", \"logs\" ]\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources.html\n #\n resources {\n cpu = 1000\n memory = 1024\n network {\n mode = \"bridge\"\n port \"https\" {\n static = 443\n }\n }\n }\n }\n }\n}",
+ "json": null,
+ "modify_index": "5922474",
+ "name": "prod-nginx",
+ "namespace": "default",
+ "policy_override": null,
+ "purge_on_destroy": null,
+ "region": "global",
+ "task_groups": [
+ {
+ "count": 1,
+ "meta": {},
+ "name": "prod-group1-nginx",
+ "task": [
+ {
+ "driver": "docker",
+ "meta": {},
+ "name": "prod-task1-nginx",
+ "volume_mounts": []
+ }
+ ],
+ "volumes": []
+ }
+ ],
+ "type": "service"
+ },
+ "sensitive_attributes": [],
+ "private": "bnVsbA==",
+ "dependencies": [
+ "module.nginx.data.template_file.nomad_job_nginx"
+ ]
+ }
+ ]
+ },
+ {
+ "module": "module.prometheus",
+ "mode": "data",
+ "type": "template_file",
+ "name": "nomad_job_prometheus",
+ "provider": "provider[\"registry.terraform.io/hashicorp/template\"]",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "filename": null,
+ "id": "cd26d062785357d96c95d6f8f40b2d8d7b430be3399de176e6b250822c4af86c",
+ "rendered": "job \"prod-prometheus\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-prometheus\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n \n volume \"prod-volume1-prometheus\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-prometheus\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n volume_mount {\n volume = \"prod-volume1-prometheus\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=/data/prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. 
This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[2m])) * 100) \u003e 80\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 80%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. 
Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. 
It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compaction failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.30:8500', '10.30.51.32:8500', '10.30.51.33:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - 
targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'cadvisorexporter' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Node Exporter'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nodeexporter' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"service\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"prometheus\"\n port = \"prometheus\"\n tags = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 8192\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"network\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"prometheus\" {\n static = 9090\n }\n }\n }\n }\n }\n}",
+ "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n %{ if use_host_volume }\n volume \"prod-volume1-${service_name}\" {\n type = \"host\"\n read_only = false\n source = \"${host_volume}\"\n }\n %{ endif }\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"constraint\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n %{ if use_host_volume }\n volume_mount {\n volume = \"prod-volume1-${service_name}\"\n destination = \"${data_dir}\"\n read_only = false\n }\n %{ endif }\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-${version}.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=${data_dir}prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${url}\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. 
This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"The number of Consul raft peers should be 3 in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might have crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[2m])) * 100) \u003e 80\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 80%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. 
Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack entries is approaching the limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }}).\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. 
It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compaction failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.30:8500', '10.30.51.32:8500', '10.30.51.33:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - 
targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'cadvisorexporter' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Node Exporter'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nodeexporter' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"service\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"network\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}",
+ "vars": {
+ "cpu": "2000",
+ "data_dir": "/data/",
+ "datacenters": "yul1",
+ "group_count": "4",
+ "host_volume": "prod-volume-data1-1",
+ "job_name": "prod-prometheus",
+ "mem": "8192",
+ "port": "9090",
+ "service_name": "prometheus",
+ "url": "https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz",
+ "use_canary": "true",
+ "use_host_volume": "true",
+ "use_vault_provider": "false",
+ "version": "2.24.0"
+ }
+ },
+ "sensitive_attributes": []
+ }
+ ]
+ },
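The state above only records the rendered result and the input vars. For orientation, here is a minimal sketch of how a template_file data source of this shape is typically declared in the module source; the template file name and layout are assumptions, not taken from this diff, while the vars mirror the values recorded in the state.

data "template_file" "nomad_job_prometheus" {
  # Hypothetical template location; the actual CSIT module may lay files out differently.
  template = file("${path.module}/prometheus.nomad")

  vars = {
    job_name           = "prod-prometheus"
    datacenters        = "yul1"
    service_name       = "prometheus"
    version            = "2.24.0"
    url                = "https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz"
    use_canary         = "true"
    group_count        = "4"
    cpu                = "2000"
    mem                = "8192"
    port               = "9090"
    data_dir           = "/data/"
    use_host_volume    = "true"
    host_volume        = "prod-volume-data1-1"
    use_vault_provider = "false"
  }
}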
+ {
+ "module": "module.prometheus",
+ "mode": "managed",
+ "type": "nomad_job",
+ "name": "nomad_job_prometheus",
+ "provider": "provider[\"registry.terraform.io/hashicorp/nomad\"].yul1",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "allocation_ids": [
+ "79c4c6b7-e8e4-79e7-9dae-9a26142cd94c",
+ "2e11f44e-814e-db74-a9ee-8bcbebb1fa0b",
+ "83f9c9aa-f4f2-b832-a937-aaee4181493e",
+ "231f1338-a871-b44e-d545-e8af01d7eebe"
+ ],
+ "datacenters": [
+ "yul1"
+ ],
+ "deployment_id": "4cbf8370-f2d7-16c2-d500-f6495d43537d",
+ "deployment_status": "successful",
+ "deregister_on_destroy": true,
+ "deregister_on_id_change": true,
+ "detach": false,
+ "id": "prod-prometheus",
+ "jobspec": "job \"prod-prometheus\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. 
Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-prometheus\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n \n volume \"prod-volume1-prometheus\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"constraint\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-prometheus\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n volume_mount {\n volume = \"prod-volume1-prometheus\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=/data/prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. 
This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"The number of Consul raft peers should be 3 in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might have crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[2m])) * 100) \u003e 80\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 80%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. 
Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack entries is approaching the limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }}).\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. 
It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compaction failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.30:8500', '10.30.51.32:8500', '10.30.51.33:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - 
targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'cadvisorexporter' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Node Exporter'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nodeexporter' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"service\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"prometheus\"\n port = \"prometheus\"\n tags = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 8192\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"network\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"prometheus\" {\n static = 9090\n }\n }\n }\n }\n }\n}",
+ "json": null,
+ "modify_index": "7147932",
+ "name": "prod-prometheus",
+ "namespace": "default",
+ "policy_override": null,
+ "purge_on_destroy": null,
+ "region": "global",
+ "task_groups": [
+ {
+ "count": 4,
+ "meta": {},
+ "name": "prod-group1-prometheus",
+ "task": [
+ {
+ "driver": "exec",
+ "meta": {},
+ "name": "prod-task1-prometheus",
+ "volume_mounts": [
+ {
+ "destination": "/data/",
+ "read_only": false,
+ "volume": "prod-volume1-prometheus"
+ }
+ ]
+ }
+ ],
+ "volumes": [
+ {
+ "name": "prod-volume1-prometheus",
+ "read_only": false,
+ "source": "prod-volume-data1-1",
+ "type": "host"
+ }
+ ]
+ }
+ ],
+ "type": "service"
+ },
+ "sensitive_attributes": [],
+ "private": "bnVsbA==",
+ "dependencies": [
+ "module.prometheus.data.template_file.nomad_job_prometheus"
+ ]
+ }
+ ]
+ },
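The managed nomad_job resource recorded above pairs with that data source: it submits the rendered jobspec to the Nomad API and, because "detach" is false, waits for the deployment to become healthy before Terraform reports success (consistent with the "successful" deployment_status in the state). A minimal sketch, assuming the provider alias seen in the state's provider address; the actual module source may differ.

resource "nomad_job" "nomad_job_prometheus" {
  provider = nomad.yul1  # aliased provider, as recorded in the state
  jobspec  = data.template_file.nomad_job_prometheus.rendered
  detach   = false       # block until the deployment finishes
}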
+ {
+ "module": "module.vpp_device",
+ "mode": "data",
+ "type": "template_file",
+ "name": "nomad_job_csit_shim",
+ "provider": "provider[\"registry.terraform.io/hashicorp/template\"]",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "filename": null,
+ "id": "a285223159ce0af9e5427d857021e5651c09408af90decd3b93a90efd4fd7ce8",
+ "rendered": "job \"prod-device-csit-shim\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"system\"\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-csit-shim-amd\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n constraint {\n attribute = \"${node.class}\"\n value = \"csit\"\n }\n\n restart {\n interval = \"1m\"\n attempts = 3\n delay = \"15s\"\n mode = \"delay\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task.html\n #\n task \"prod-task1-csit-shim-amd\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"csit_shim-ubuntu1804:local\"\n network_mode = \"host\"\n pid_mode = \"host\"\n volumes = [\n \"/var/run/docker.sock:/var/run/docker.sock\"\n ]\n privileged = true\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources.html\n #\n resources {\n cpu = 1000\n memory = 5000\n network {\n port \"ssh\" {\n static = 6022\n }\n port \"ssh2\" {\n static = 6023\n }\n }\n }\n }\n }\n\n group \"prod-group1-csit-shim-arm\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. 
This value must be non-negative and defaults\n # to 1.\n count = 1\n\n constraint {\n attribute = \"${node.class}\"\n value = \"csitarm\"\n }\n\n restart {\n interval = \"1m\"\n attempts = 3\n delay = \"15s\"\n mode = \"delay\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task.html\n #\n task \"prod-task1-csit-shim-arm\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"csit_shim-ubuntu1804:local\"\n network_mode = \"host\"\n pid_mode = \"host\"\n volumes = [\n \"/var/run/docker.sock:/var/run/docker.sock\"\n ]\n privileged = true\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources.html\n #\n resources {\n cpu = 1000\n memory = 5000\n network {\n port \"ssh\" {\n static = 6022\n }\n port \"ssh2\" {\n static = 6023\n }\n }\n }\n }\n }\n}",
+ "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"system\"\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-csit-shim-amd\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n constraint {\n attribute = \"$${node.class}\"\n value = \"csit\"\n }\n\n restart {\n interval = \"1m\"\n attempts = 3\n delay = \"15s\"\n mode = \"delay\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task.html\n #\n task \"prod-task1-csit-shim-amd\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"csit_shim-ubuntu1804:local\"\n network_mode = \"host\"\n pid_mode = \"host\"\n volumes = [\n \"/var/run/docker.sock:/var/run/docker.sock\"\n ]\n privileged = true\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources.html\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n network {\n port \"ssh\" {\n static = 6022\n }\n port \"ssh2\" {\n static = 6023\n }\n }\n }\n }\n }\n\n group \"prod-group1-csit-shim-arm\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. 
This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n constraint {\n attribute = \"$${node.class}\"\n value = \"csitarm\"\n }\n\n restart {\n interval = \"1m\"\n attempts = 3\n delay = \"15s\"\n mode = \"delay\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task.html\n #\n task \"prod-task1-csit-shim-arm\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"csit_shim-ubuntu1804:local\"\n network_mode = \"host\"\n pid_mode = \"host\"\n volumes = [\n \"/var/run/docker.sock:/var/run/docker.sock\"\n ]\n privileged = true\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources.html\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n network {\n port \"ssh\" {\n static = 6022\n }\n port \"ssh2\" {\n static = 6023\n }\n }\n }\n }\n }\n}",
+ "vars": {
+ "cpu": "1000",
+ "datacenters": "yul1",
+ "group_count": "1",
+ "job_name": "prod-device-csit-shim",
+ "mem": "5000"
+ }
+ },
+ "sensitive_attributes": []
+ }
+ ]
+ },
+ {
+ "module": "module.vpp_device",
+ "mode": "managed",
+ "type": "nomad_job",
+ "name": "nomad_job_csit_shim",
+ "provider": "provider[\"registry.terraform.io/hashicorp/nomad\"].yul1",
+ "instances": [
+ {
+ "schema_version": 0,
+ "attributes": {
+ "allocation_ids": [
+ "ecfcd9ad-b959-0fa4-c1ec-8b82195830f1",
+ "190f5699-0d10-7f85-ac68-96927702070b",
+ "2fbbef45-f00a-1367-08c4-c2239de9e78d",
+ "dd57bc83-2229-d242-1caf-b743f186e7a2"
+ ],
+ "datacenters": [
+ "yul1"
+ ],
+ "deployment_id": "",
+ "deployment_status": "",
+ "deregister_on_destroy": true,
+ "deregister_on_id_change": true,
+ "detach": false,
+ "id": "prod-device-csit-shim",
+ "jobspec": "job \"prod-device-csit-shim\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"system\"\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-csit-shim-amd\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n constraint {\n attribute = \"${node.class}\"\n value = \"csit\"\n }\n\n restart {\n interval = \"1m\"\n attempts = 3\n delay = \"15s\"\n mode = \"delay\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task.html\n #\n task \"prod-task1-csit-shim-amd\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"csit_shim-ubuntu1804:local\"\n network_mode = \"host\"\n pid_mode = \"host\"\n volumes = [\n \"/var/run/docker.sock:/var/run/docker.sock\"\n ]\n privileged = true\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources.html\n #\n resources {\n cpu = 1000\n memory = 5000\n network {\n port \"ssh\" {\n static = 6022\n }\n port \"ssh2\" {\n static = 6023\n }\n }\n }\n }\n }\n\n group \"prod-group1-csit-shim-arm\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. 
This value must be non-negative and defaults\n # to 1.\n count = 1\n\n constraint {\n attribute = \"${node.class}\"\n value = \"csitarm\"\n }\n\n restart {\n interval = \"1m\"\n attempts = 3\n delay = \"15s\"\n mode = \"delay\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task.html\n #\n task \"prod-task1-csit-shim-arm\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"csit_shim-ubuntu1804:local\"\n network_mode = \"host\"\n pid_mode = \"host\"\n volumes = [\n \"/var/run/docker.sock:/var/run/docker.sock\"\n ]\n privileged = true\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources.html\n #\n resources {\n cpu = 1000\n memory = 5000\n network {\n port \"ssh\" {\n static = 6022\n }\n port \"ssh2\" {\n static = 6023\n }\n }\n }\n }\n }\n}",
+ "json": null,
+ "modify_index": "7077736",
+ "name": "prod-device-csit-shim",
+ "namespace": "default",
+ "policy_override": null,
+ "purge_on_destroy": null,
+ "region": "global",
+ "task_groups": [
+ {
+ "count": 1,
+ "meta": {},
+ "name": "prod-group1-csit-shim-amd",
+ "task": [
+ {
+ "driver": "docker",
+ "meta": {},
+ "name": "prod-task1-csit-shim-amd",
+ "volume_mounts": []
+ }
+ ],
+ "volumes": []
+ },
+ {
+ "count": 1,
+ "meta": {},
+ "name": "prod-group1-csit-shim-arm",
+ "task": [
+ {
+ "driver": "docker",
+ "meta": {},
+ "name": "prod-task1-csit-shim-arm",
+ "volume_mounts": []
+ }
+ ],
+ "volumes": []
+ }
+ ],
+ "type": "system"
+ },
+ "sensitive_attributes": [],
+ "private": "bnVsbA==",
+ "dependencies": [
+ "module.vpp_device.data.template_file.nomad_job_csit_shim"
+ ]
+ }
+ ]
+ }
+ ]
+}
+>>>>>>> CHANGE (a44eef Infra: Monitoring capability)
diff --git a/terraform-ci-infra/1n_nmd/vpp_device/conf/nomad/csit_shim.hcl b/terraform-ci-infra/1n_nmd/vpp_device/conf/nomad/csit_shim.hcl
new file mode 100644
index 0000000000..cf065429d1
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/vpp_device/conf/nomad/csit_shim.hcl
@@ -0,0 +1,169 @@
+job "${job_name}" {
+ # The "region" parameter specifies the region in which to execute the job.
+ # If omitted, this inherits the default region name of "global".
+ # region = "global"
+ #
+ # The "datacenters" parameter specifies the list of datacenters which should
+ # be considered when placing this task. This must be provided.
+ datacenters = "${datacenters}"
+
+ # The "type" parameter controls the type of job, which impacts the scheduler's
+ # decision on placement. This configuration is optional and defaults to
+ # "service". For a full list of job types and their differences, please see
+ # the online documentation.
+ #
+ # For more information, please see the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/jobspec/schedulers.html
+ #
+ type = "system"
+
+ # The "group" stanza defines a series of tasks that should be co-located on
+ # the same Nomad client. Any task within a group will be placed on the same
+ # client.
+ #
+ # For more information and examples on the "group" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/group.html
+ #
+ group "prod-group1-csit-shim-amd" {
+ # The "count" parameter specifies the number of the task groups that should
+ # be running under this group. This value must be non-negative and defaults
+ # to 1.
+ count = ${group_count}
+
+ constraint {
+ attribute = "$${node.class}"
+ value = "csit"
+ }
+
+ restart {
+ interval = "1m"
+ attempts = 3
+ delay = "15s"
+ mode = "delay"
+ }
+
+ # The "task" stanza creates an individual unit of work, such as a Docker
+ # container, web application, or batch processing.
+ #
+ # For more information and examples on the "task" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/task.html
+ #
+ task "prod-task1-csit-shim-amd" {
+ # The "driver" parameter specifies the task driver that should be used to
+ # run the task.
+ driver = "docker"
+
+ # The "config" stanza specifies the driver configuration, which is passed
+ # directly to the driver to start the task. The details of configurations
+ # are specific to each driver, so please see specific driver
+ # documentation for more information.
+ config {
+ image = "csit_shim-ubuntu1804:local"
+ network_mode = "host"
+ pid_mode = "host"
+ volumes = [
+ "/var/run/docker.sock:/var/run/docker.sock"
+ ]
+ privileged = true
+ }
+
+ # The "resources" stanza describes the requirements a task needs to
+ # execute. Resource requirements include memory, network, cpu, and more.
+ # This ensures the task will execute on a machine that contains enough
+ # resource capacity.
+ #
+ # For more information and examples on the "resources" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/resources.html
+ #
+ resources {
+ cpu = ${cpu}
+ memory = ${mem}
+ network {
+ port "ssh" {
+ static = 6022
+ }
+ port "ssh2" {
+ static = 6023
+ }
+ }
+ }
+ }
+ }
+
+ group "prod-group1-csit-shim-arm" {
+ # The "count" parameter specifies the number of the task groups that should
+ # be running under this group. This value must be non-negative and defaults
+ # to 1.
+ count = ${group_count}
+
+ constraint {
+ attribute = "$${node.class}"
+ value = "csitarm"
+ }
+
+ restart {
+ interval = "1m"
+ attempts = 3
+ delay = "15s"
+ mode = "delay"
+ }
+
+ # The "task" stanza creates an individual unit of work, such as a Docker
+ # container, web application, or batch processing.
+ #
+ # For more information and examples on the "task" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/task.html
+ #
+ task "prod-task1-csit-shim-arm" {
+ # The "driver" parameter specifies the task driver that should be used to
+ # run the task.
+ driver = "docker"
+
+ # The "config" stanza specifies the driver configuration, which is passed
+ # directly to the driver to start the task. The details of configurations
+ # are specific to each driver, so please see specific driver
+ # documentation for more information.
+ config {
+ image = "csit_shim-ubuntu1804:local"
+ network_mode = "host"
+ pid_mode = "host"
+ volumes = [
+ "/var/run/docker.sock:/var/run/docker.sock"
+ ]
+ privileged = true
+ }
+
+ # The "resources" stanza describes the requirements a task needs to
+ # execute. Resource requirements include memory, network, cpu, and more.
+ # This ensures the task will execute on a machine that contains enough
+ # resource capacity.
+ #
+ # For more information and examples on the "resources" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/resources.html
+ #
+ resources {
+ cpu = ${cpu}
+ memory = ${mem}
+ network {
+ port "ssh" {
+ static = 6022
+ }
+ port "ssh2" {
+ static = 6023
+ }
+ }
+ }
+ }
+ }
+}
\ No newline at end of file
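
Note on the template above: plain ${...} references (job_name, datacenters, group_count, cpu, mem) are substituted by Terraform when the template is rendered, while the doubled form $${node.class} escapes Terraform's interpolation so the rendered jobspec keeps the literal ${node.class} for Nomad to resolve against the client node at scheduling time. Without the escape, rendering would fail, since no "node" variable is supplied. A minimal sketch of the same rendering, using the built-in templatefile() function (the function-based equivalent of the template_file data source wired up in main.tf below) with the values recorded in the state above:

    locals {
      # Renders csit_shim.hcl with Terraform-side values; "$${node.class}"
      # survives as the literal "${node.class}" in the output.
      csit_shim_rendered = templatefile("${path.module}/conf/nomad/csit_shim.hcl", {
        job_name    = "prod-device-csit-shim"
        datacenters = "yul1"
        group_count = 1
        cpu         = 1000
        mem         = 5000
      })
    }
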
diff --git a/terraform-ci-infra/1n_nmd/vpp_device/main.tf b/terraform-ci-infra/1n_nmd/vpp_device/main.tf
new file mode 100644
index 0000000000..c4a3e27970
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/vpp_device/main.tf
@@ -0,0 +1,19 @@
+locals {
+ datacenters = join(",", var.nomad_datacenters)
+}
+
+data "template_file" "nomad_job_csit_shim" {
+ template = file("${path.module}/conf/nomad/csit_shim.hcl")
+ vars = {
+ datacenters = local.datacenters
+ job_name = var.csit_shim_job_name
+ group_count = var.csit_shim_group_count
+ cpu = var.csit_shim_cpu
+ mem = var.csit_shim_mem
+ }
+}
+
+resource "nomad_job" "nomad_job_csit_shim" {
+ jobspec = data.template_file.nomad_job_csit_shim.rendered
+ detach = false
+}
\ No newline at end of file
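
main.tf is deliberately thin: it joins var.nomad_datacenters into the comma-separated string the template expects, renders csit_shim.hcl through a template_file data source, and registers the result as a Nomad job; detach = false makes Terraform wait for the job registration to settle rather than returning immediately. A root-module invocation consistent with the state recorded above might look like the following sketch (the module path, the provider alias wiring, and the explicit overrides are inferred from the state's "module" and "provider" fields and template vars; they are not part of this patch):

    module "vpp_device" {
      source = "./vpp_device"
      providers = {
        nomad = nomad.yul1
      }

      nomad_datacenters = [ "yul1" ]

      # Values recorded in the applied state; they override the defaults
      # declared in variables.tf.
      csit_shim_job_name    = "prod-device-csit-shim"
      csit_shim_group_count = 1
      csit_shim_cpu         = 1000
      csit_shim_mem         = 5000
    }
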
diff --git a/terraform-ci-infra/1n_nmd/vpp_device/variables.tf b/terraform-ci-infra/1n_nmd/vpp_device/variables.tf
new file mode 100644
index 0000000000..ca574e770a
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/vpp_device/variables.tf
@@ -0,0 +1,31 @@
+# Nomad
+variable "nomad_datacenters" {
+ description = "Nomad data centers"
+ type = list(string)
+ default = [ "dc1" ]
+}
+
+# CSIT SHIM
+variable "csit_shim_job_name" {
+ description = "CSIT SHIM job name"
+ type = string
+ default = "prod-csit-shim"
+}
+
+variable "csit_shim_group_count" {
+ description = "Number of CSIT SHIM group instances"
+ type = number
+ default = 1
+}
+
+variable "csit_shim_cpu" {
+ description = "CSIT SHIM task CPU"
+ type = number
+ default = 2000
+}
+
+variable "csit_shim_mem" {
+ description = "CSIT SHIM task memory"
+ type = number
+ default = 10000
+}
\ No newline at end of file
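
The defaults above are roomier than the applied values recorded in the state (Nomad interprets cpu in MHz and memory in MB, so 2000/10000 here versus the 1000/5000 actually deployed), and the jobspec comments note that count must be non-negative. If stronger guardrails were wanted, a validation block (available in Terraform 0.13 and later) could encode that constraint directly on the variable; a hypothetical hardening sketch, not part of this patch:

    variable "csit_shim_group_count" {
      description = "Number of CSIT SHIM group instances"
      type        = number
      default     = 1

      validation {
        condition     = var.csit_shim_group_count >= 0
        error_message = "csit_shim_group_count must be non-negative."
      }
    }
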