diff options
29 files changed, 417 insertions, 1110 deletions
diff --git a/resources/tools/testbed-setup/ansible/nomad.yaml b/resources/tools/testbed-setup/ansible/nomad.yaml index 653215651a..134fffcb33 100644 --- a/resources/tools/testbed-setup/ansible/nomad.yaml +++ b/resources/tools/testbed-setup/ansible/nomad.yaml @@ -15,4 +15,8 @@ - role: nomad tags: nomad - role: consul - tags: consul
\ No newline at end of file + tags: consul + - role: prometheus_exporter + tags: prometheus_exporter + - role: cadvisor + tags: cadvisor
\ No newline at end of file diff --git a/resources/tools/testbed-setup/ansible/roles/ab/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/ab/tasks/main.yaml index c422b4d2e0..85003d583d 100644 --- a/resources/tools/testbed-setup/ansible/roles/ab/tasks/main.yaml +++ b/resources/tools/testbed-setup/ansible/roles/ab/tasks/main.yaml @@ -1,15 +1,18 @@ --- # file: roles/ab/tasks/main.yaml -- Name: Update package cache (apt) +- name: Inst - Update Package Cache (APT) apt: update_cache: yes cache_valid_time: 3600 when: - ansible_distribution|lower == 'ubuntu' + tags: + - ab-inst-prerequisites -- name: Install Apache ab tools +- name: Inst - Apache ab tools package: name: "{{ packages | flatten(levels=1) }}" state: present - tags: install-ab + tags: + - ab-inst-ab
\ No newline at end of file diff --git a/resources/tools/testbed-setup/ansible/roles/cadvisor/defaults/main.yaml b/resources/tools/testbed-setup/ansible/roles/cadvisor/defaults/main.yaml new file mode 100644 index 0000000000..3b25e551ea --- /dev/null +++ b/resources/tools/testbed-setup/ansible/roles/cadvisor/defaults/main.yaml @@ -0,0 +1,24 @@ +--- +# file: roles/cadvisor/defaults/main.yaml + +packages: "{{ packages_base + packages_by_distro[ansible_distribution | lower] + packages_by_arch[ansible_machine] }}" + +packages_base: + - [] + +packages_by_distro: + ubuntu: + - "python3-docker" + - "python3-dockerpty" + +packages_by_arch: + aarch64: + - [] + x86_64: + - [] + +image: "{{ image_by_arch[ansible_machine] }}" + +image_by_arch: + aarch64: "zcube/cadvisor:v0.37.0" + x86_64: "gcr.io/cadvisor/cadvisor:v0.38.7"
\ No newline at end of file diff --git a/resources/tools/testbed-setup/ansible/roles/cadvisor/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/cadvisor/tasks/main.yaml new file mode 100644 index 0000000000..a2a13368c2 --- /dev/null +++ b/resources/tools/testbed-setup/ansible/roles/cadvisor/tasks/main.yaml @@ -0,0 +1,39 @@ +--- +# file: roles/cadvisor/tasks/main.yaml + +- name: Inst - Update Package Cache (APT) + apt: + update_cache: yes + cache_valid_time: 3600 + when: + - ansible_distribution|lower == 'ubuntu' + tags: + - cadvisor-inst-prerequisites + +- name: Inst - Prerequisites + package: + name: "{{ packages | flatten(levels=1) }}" + state: latest + tags: + - cadvisor-inst-prerequisites + +- name: Inst - Start a container + docker_container: + name: "cAdvisor" + image: "{{ image }}" + state: "started" + restart_policy: "unless-stopped" + detach: yes + devices: + - "/dev/kmsg" + ports: + - "8080:8080" + privileged: yes + volumes: + - "/:/rootfs:ro" + - "/var/run:/var/run:ro" + - "/sys:/sys:ro" + - "/var/lib/docker/:/var/lib/docker:ro" + - "/dev/disk/:/dev/disk:ro" + tags: + - cadvisor-run-container diff --git a/resources/tools/testbed-setup/ansible/roles/calibration/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/calibration/tasks/main.yaml index 877304cc36..5d0c3b18ff 100644 --- a/resources/tools/testbed-setup/ansible/roles/calibration/tasks/main.yaml +++ b/resources/tools/testbed-setup/ansible/roles/calibration/tasks/main.yaml @@ -1,19 +1,21 @@ --- # file: roles/calibration/tasks/main.yaml -- Name: Update package cache (apt) +- name: Inst - Update Package Cache (APT) apt: update_cache: yes cache_valid_time: 3600 when: - ansible_distribution|lower == 'ubuntu' + tags: + - calibration-inst-prerequisites -- name: Install Distribution - Release - Machine Prerequisites +- name: Inst - Prerequisites package: name: "{{ packages | flatten(levels=1) }}" state: latest tags: - - install-dependencies + - calibration-inst-prerequisites - name: Check CPU Power States shell: "lscpu" diff --git a/resources/tools/testbed-setup/ansible/roles/common/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/common/tasks/main.yaml index 7be6f73951..35368dd45e 100644 --- a/resources/tools/testbed-setup/ansible/roles/common/tasks/main.yaml +++ b/resources/tools/testbed-setup/ansible/roles/common/tasks/main.yaml @@ -11,19 +11,21 @@ tags: - set-proxy -- Name: Update package cache (apt) +- name: Inst - Update package cache (apt) apt: update_cache: yes cache_valid_time: 3600 when: - ansible_distribution|lower == 'ubuntu' + tags: + - common-inst-prerequisites -- name: Install Distribution - Release - Machine Prerequisites +- name: Inst - Prerequisites package: name: "{{ packages | flatten(levels=1) }}" state: latest tags: - - install-dependencies + - common-inst-prerequisites - name: Install CSIT PIP requirements pip: diff --git a/resources/tools/testbed-setup/ansible/roles/consul/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/consul/tasks/main.yaml index d4be90b40b..99ac52da44 100644 --- a/resources/tools/testbed-setup/ansible/roles/consul/tasks/main.yaml +++ b/resources/tools/testbed-setup/ansible/roles/consul/tasks/main.yaml @@ -1,12 +1,14 @@ --- # file: roles/consul/tasks/main.yaml -- Name: Update package cache (apt) +- name: Inst - Update Package Cache (APT) apt: update_cache: yes cache_valid_time: 3600 when: - ansible_distribution|lower == 'ubuntu' + tags: + - consul-inst-prerequisites - name: Inst - Prerequisites package: diff --git a/resources/tools/testbed-setup/ansible/roles/dpdk/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/dpdk/tasks/main.yaml index a38a9ee429..27a5c2c9e3 100644 --- a/resources/tools/testbed-setup/ansible/roles/dpdk/tasks/main.yaml +++ b/resources/tools/testbed-setup/ansible/roles/dpdk/tasks/main.yaml @@ -1,21 +1,23 @@ --- # file: roles/dpdk/tasks/main.yaml -- Name: Update package cache (apt) +- name: Inst - Update Package Cache (APT) apt: update_cache: yes cache_valid_time: 3600 when: - ansible_distribution|lower == 'ubuntu' + tags: + - dpdk-inst-prerequisites -- name: DPDK - Install Distribution - Release - Machine Prerequisites +- name: Inst - Prerequisites package: name: "{{ packages | flatten(levels=1) }}" state: latest tags: - - install-dependencies + - dpdk-inst-prerequisites -- name: DPDK - Download Release Archive +- name: Download Release Archive get_url: url: "{{ dpdk_url }}/dpdk-{{ item }}.tar.xz" dest: "{{ dpdk_target_dir }}/dpdk-{{ item }}.tar.xz" @@ -25,7 +27,7 @@ tags: - install-dpdk -- name: DPDK - Extract Release Archive +- name: Extract Release Archive unarchive: remote_src: true src: "{{ dpdk_target_dir }}/dpdk-{{ item }}.tar.xz" @@ -37,7 +39,7 @@ tags: - install-dpdk -- name: DPDK - Build igb_uio by default +- name: Build igb_uio by default lineinfile: dest: "{{ dpdk_target_dir }}/dpdk-{{ item }}/config/common_base" regexp: "^CONFIG_RTE_EAL_IGB_UIO" @@ -48,7 +50,7 @@ tags: - install-dpdk -- name: DPDK - Compile Release I +- name: Compile Release I become: yes command: "make install T={{ dpdk_build_targets[item][ansible_machine] }} DESTDIR={{ dpdk_target_dir }}/dpdk-{{ item }} chdir={{ dpdk_target_dir }}/dpdk-{{ item }}" loop: "{{ dpdk_version }}" @@ -57,7 +59,7 @@ tags: - install-dpdk -- name: DPDK - Link igb_uio Module +- name: Link igb_uio Module shell: "ln -fs {{ dpdk_target_dir }}/dpdk-{{ item }}/{{ dpdk_build_targets[item][ansible_machine] }}/kmod/igb_uio.ko /lib/modules/`uname -r`/igb_uio.ko && depmod -a" ignore_errors: "yes" loop: "{{ dpdk_version }}" diff --git a/resources/tools/testbed-setup/ansible/roles/iperf/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/iperf/tasks/main.yaml index 00940e4958..853a00cb86 100644 --- a/resources/tools/testbed-setup/ansible/roles/iperf/tasks/main.yaml +++ b/resources/tools/testbed-setup/ansible/roles/iperf/tasks/main.yaml @@ -1,21 +1,23 @@ --- # file: roles/iperf/tasks/main.yaml -- Name: Update package cache (apt) +- name: Inst - Update Package Cache (APT) apt: update_cache: yes cache_valid_time: 3600 when: - ansible_distribution|lower == 'ubuntu' + tags: + - iperf-inst-prerequisites -- name: iPerf - Install Distribution - Release - Machine Prerequisites +- name: Inst - Prerequisites package: name: "{{ packages | flatten(levels=1) }}" state: latest tags: - - install-dependencies + - iperf-inst-prerequisites -- name: iPerf - Get Release Archive +- name: Get Release Archive get_url: url: "https://downloads.es.net/pub/iperf/iperf-{{ item }}.tar.gz" dest: "{{ iperf_target_dir }}/iperf-{{ item }}.tar.gz" @@ -25,7 +27,7 @@ tags: - install-iperf -- name: iPerf - Extract Release Archive +- name: Extract Release Archive unarchive: remote_src: true src: "{{ iperf_target_dir }}/iperf-{{ item }}.tar.gz" @@ -35,7 +37,7 @@ tags: - install-iperf -- name: iPerf - Compile Release I +- name: Compile Release I command: "./configure" args: chdir: "{{ iperf_target_dir }}/iperf-{{ item }}/" @@ -43,7 +45,7 @@ tags: - install-iperf -- name: iPerf - Compile Release II +- name: Compile Release II command: "make" args: chdir: "{{ iperf_target_dir }}/iperf-{{ item }}/" @@ -51,7 +53,7 @@ tags: - install-iperf -- name: iPerf - Compile Release III +- name: Compile Release III command: "make install" args: chdir: "{{ iperf_target_dir }}/iperf-{{ item }}/" diff --git a/resources/tools/testbed-setup/ansible/roles/mellanox/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/mellanox/tasks/main.yaml index f677836702..1b81215a73 100644 --- a/resources/tools/testbed-setup/ansible/roles/mellanox/tasks/main.yaml +++ b/resources/tools/testbed-setup/ansible/roles/mellanox/tasks/main.yaml @@ -1,21 +1,23 @@ --- # file: roles/mellanox/tasks/main.yaml -- Name: Update package cache (apt) +- name: Inst - Update Package Cache (APT) apt: update_cache: yes cache_valid_time: 3600 when: - ansible_distribution|lower == 'ubuntu' + tags: + - mellanox-inst-prerequisites -- name: Mellanox Install - Install Distribution - Release - Machine Prerequisites +- name: Inst - Prerequisites package: name: "{{ packages | flatten(levels=1) }}" state: latest tags: - - install-dependencies + - mellanox-inst-prerequisites -- name: Mellanox Install - Check Presence of Mellanox Hardware +- name: Check Presence of Mellanox Hardware shell: "lspci | grep Mellanox | awk '{print $1}'" register: mellanox_pcis failed_when: no @@ -23,7 +25,7 @@ tags: - install-mellanox -- name: Mellanox Install - Get OFED +- name: Get OFED get_url: url: "http://content.mellanox.com/ofed/MLNX_OFED-{{ mellanox_version }}/MLNX_OFED_LINUX-{{ mellanox_version }}-{{ ansible_distribution|lower }}{{ ansible_distribution_version }}-{{ ansible_machine }}.tgz" dest: "/opt/MLNX_OFED_LINUX-{{ mellanox_version }}-{{ ansible_distribution|lower }}{{ ansible_distribution_version }}-{{ ansible_machine }}.tgz" @@ -32,7 +34,7 @@ tags: - install-mellanox -- name: Mellanox Install - Extract OFED +- name: Extract OFED unarchive: remote_src: true src: "/opt/MLNX_OFED_LINUX-{{ mellanox_version }}-{{ ansible_distribution|lower }}{{ ansible_distribution_version }}-{{ ansible_machine }}.tgz" @@ -43,7 +45,7 @@ tags: - install-mellanox -- name: Mellanox Install - Install OFED +- name: Install OFED command: "./mlnxofedinstall --with-mft --dpdk --force --upstream-libs" args: chdir: "/opt/MLNX_OFED_LINUX-{{ mellanox_version }}-{{ ansible_distribution|lower }}{{ ansible_distribution_version }}-{{ ansible_machine }}" @@ -51,13 +53,13 @@ tags: - install-mellanox -- name: Mellanox Install - Switch Infiniband to Ethernet +- name: Switch Infiniband to Ethernet command: "mlxconfig --yes --dev {{ item }} set LINK_TYPE_P1=2 LINK_TYPE_P2=2" with_items: "{{ mellanox_pcis.stdout_lines }}" tags: - install-mellanox -- name: Mellanox Install - FIX qemu-system removal +- name: FIX qemu-system removal package: name: "qemu-system" state: latest diff --git a/resources/tools/testbed-setup/ansible/roles/nomad/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/nomad/tasks/main.yaml index 5b323054d4..54e80513b8 100644 --- a/resources/tools/testbed-setup/ansible/roles/nomad/tasks/main.yaml +++ b/resources/tools/testbed-setup/ansible/roles/nomad/tasks/main.yaml @@ -1,12 +1,14 @@ --- # file: roles/nomad/tasks/main.yaml -- Name: Update package cache (apt) +- name: Inst - Update Package Cache (APT) apt: update_cache: yes cache_valid_time: 3600 when: - ansible_distribution|lower == 'ubuntu' + tags: + - nomad-inst-prerequisites - name: Inst - Prerequisites package: diff --git a/resources/tools/testbed-setup/ansible/roles/performance_tuning/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/performance_tuning/tasks/main.yaml index 608d3e2505..3c22892483 100644 --- a/resources/tools/testbed-setup/ansible/roles/performance_tuning/tasks/main.yaml +++ b/resources/tools/testbed-setup/ansible/roles/performance_tuning/tasks/main.yaml @@ -1,26 +1,28 @@ --- # file: roles/performance_tuning/tasks/main.yaml -- Name: Update package cache (apt) +- name: Inst - Update Package Cache (APT) apt: update_cache: yes cache_valid_time: 3600 when: - ansible_distribution|lower == 'ubuntu' + tags: + - perf-inst-prerequisites -- name: Performance Tuning - Install Distribution - Release - Machine Prerequisites +- name: Inst - Machine Prerequisites package: name: "{{ packages | flatten(levels=1) }}" state: latest tags: - - install-dependencies + - perf-inst-prerequisites -- name: Performance Tuning - Distribution - release - machine optimizations +- name: Distribution - release - machine optimizations include_tasks: '{{ ansible_distribution|lower }}_{{ ansible_distribution_release }}.yaml' tags: - machine-optimizations -- name: Performance Tuning - Configure {{ ansible_machine }} Kernel Parameters +- name: Configure {{ ansible_machine }} Kernel Parameters lineinfile: path: "/etc/default/grub" state: "present" @@ -33,7 +35,7 @@ - meta: flush_handlers -- name: Performance Tuning - Turbo Boost +- name: Turbo Boost import_tasks: turbo_boost.yaml when: > cpu_microarchitecture == "skylake" or @@ -41,7 +43,7 @@ tags: - turbo-boost -- name: Performance Tuning - Adjust nr_hugepages +- name: Adjust nr_hugepages # change the minimum size of the hugepage pool. # 2G VPP, 4GB per VNF/CNF, 2G reserve sysctl: @@ -53,7 +55,7 @@ tags: - set-sysctl -- name: Performance Tuning - Adjust max_map_count +- name: Adjust max_map_count # this file contains the maximum number of memory map areas a process # may have. memory map areas are used as a side-effect of calling # malloc, directly by mmap and mprotect, and also when loading shared @@ -72,7 +74,7 @@ tags: - set-sysctl -- name: Performance Tuning - Adjust hugetlb_shm_group +- name: Adjust hugetlb_shm_group # hugetlb_shm_group contains group id that is allowed to create sysv # shared memory segment using hugetlb page. sysctl: @@ -84,7 +86,7 @@ tags: - set-sysctl -- name: Performance Tuning - Adjust swappiness +- name: Adjust swappiness # this control is used to define how aggressive the kernel will swap # memory pages. higher values will increase agressiveness, lower values # decrease the amount of swap. a value of 0 instructs the kernel not to @@ -99,7 +101,7 @@ tags: - set-sysctl -- name: Performance Tuning - Adjust shmmax +- name: Adjust shmmax # shared memory max must be greator or equal to the total size of hugepages. # for 2mb pages, totalhugepagesize = vm.nr_hugepages * 2 * 1024 * 1024 # if the existing kernel.shmmax setting (cat /sys/proc/kernel/shmmax) @@ -114,7 +116,7 @@ tags: - set-sysctl -- name: Performance Tuning - Adjust watchdog_cpumask +- name: Adjust watchdog_cpumask # this value can be used to control on which cpus the watchdog may run. # the default cpumask is all possible cores, but if no_hz_full is # enabled in the kernel config, and cores are specified with the @@ -134,7 +136,7 @@ tags: - set-sysctl -- name: Performance Tuning - Adjust randomize_va_space +- name: Adjust randomize_va_space # this option can be used to select the type of process address # space randomization that is used in the system, for architectures # that support this feature. @@ -150,7 +152,7 @@ tags: - set-sysctl -- name: Performance Tuning - Copy Cpufrequtils File +- name: Copy Cpufrequtils File copy: src: "files/cpufrequtils" dest: "/etc/default/cpufrequtils" @@ -160,7 +162,7 @@ tags: - copy-cpufrequtils -- name: Performance Tuning - Copy Irqbalance File +- name: Copy Irqbalance File template: src: "files/irqbalance" dest: "/etc/default/irqbalance" @@ -172,14 +174,14 @@ tags: - copy-irqbalance -- name: Performance Tuning - Set Ondemand Service To Disable +- name: Set Ondemand Service To Disable service: name: "ondemand" enabled: "no" tags: - set-ondemand -- name: Performance Tuning - Load Kernel Modules By Default +- name: Load Kernel Modules By Default lineinfile: path: "/etc/modules" state: "present" diff --git a/resources/tools/testbed-setup/ansible/roles/performance_tuning/tasks/turbo_boost.yaml b/resources/tools/testbed-setup/ansible/roles/performance_tuning/tasks/turbo_boost.yaml index 23cca0566e..11f0e326d9 100644 --- a/resources/tools/testbed-setup/ansible/roles/performance_tuning/tasks/turbo_boost.yaml +++ b/resources/tools/testbed-setup/ansible/roles/performance_tuning/tasks/turbo_boost.yaml @@ -1,22 +1,24 @@ --- # file: roles/performance_tuning/tasks/turbo_boost.yaml -- Name: Update package cache (apt) +- name: Inst - Update Package Cache (APT) apt: update_cache: yes cache_valid_time: 3600 when: - ansible_distribution|lower == 'ubuntu' + tags: + - turbo-inst-prerequisites -- name: Turbo Boost - Install msr-tools +- name: Install msr-tools package: name: - "msr-tools" state: latest tags: - - turbo-boost + - turbo-inst-prerequisites -- name: Turbo Boost - Load msr By Default +- name: Load msr By Default lineinfile: path: "/etc/modules" state: "present" @@ -24,7 +26,7 @@ tags: - turbo-boost -- name: Turbo Boost - Custom Startup Service Hook +- name: Custom Startup Service Hook copy: src: "files/disable-turbo-boost.service" dest: "/etc/systemd/system/disable-turbo-boost.service" @@ -34,7 +36,7 @@ tags: - turbo-boost -- name: Turbo Boost - Custom Startup Service Hook Enable +- name: Custom Startup Service Hook Enable service: name: "disable-turbo-boost" enabled: yes diff --git a/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/defaults/main.yaml b/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/defaults/main.yaml new file mode 100644 index 0000000000..eb2b94cb26 --- /dev/null +++ b/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/defaults/main.yaml @@ -0,0 +1,17 @@ +--- +# file: roles/prometheus_exporter/defaults/main.yaml + +# Inst - Exporters. +ne_packages: "{{ ne_packages_by_distro[ansible_distribution | lower][ansible_machine] }}" + +ne_packages_by_distro: + ubuntu: + aarch64: "http://ports.ubuntu.com/pool/universe/p/prometheus-node-exporter/prometheus-node-exporter_1.0.1+ds-1_arm64.deb" + x86_64: "http://archive.ubuntu.com/ubuntu/pool/universe/p/prometheus-node-exporter/prometheus-node-exporter_1.0.1+ds-1_amd64.deb" + +be_packages: "{{ be_packages_by_distro[ansible_distribution | lower][ansible_machine] }}" + +be_packages_by_distro: + ubuntu: + aarch64: "http://ports.ubuntu.com/pool/universe/p/prometheus-blackbox-exporter/prometheus-blackbox-exporter_0.17.0+ds-1_arm64.deb" + x86_64: "http://archive.ubuntu.com/ubuntu/pool/universe/p/prometheus-blackbox-exporter/prometheus-blackbox-exporter_0.17.0+ds-1_amd64.deb" diff --git a/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/files/blackbox.yml b/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/files/blackbox.yml new file mode 100644 index 0000000000..f61c26e1a8 --- /dev/null +++ b/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/files/blackbox.yml @@ -0,0 +1,25 @@ +modules: + http_2xx: + prober: http + timeout: 5s + http: + valid_http_versions: ["HTTP/1.1", "HTTP/2.0"] + no_follow_redirects: false + fail_if_ssl: false + fail_if_not_ssl: true + tls_config: + insecure_skip_verify: false + preferred_ip_protocol: "ip4" + icmp_v4: + prober: icmp + timeout: 5s + icmp: + preferred_ip_protocol: "ip4" + dns_udp: + prober: dns + timeout: 5s + dns: + query_name: "jenkins.fd.io" + query_type: "A" + valid_rcodes: + - NOERROR
\ No newline at end of file diff --git a/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/handlers/main.yaml b/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/handlers/main.yaml new file mode 100644 index 0000000000..9c374eaa61 --- /dev/null +++ b/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/handlers/main.yaml @@ -0,0 +1,16 @@ +--- +# file roles/prometheus_exporter/handlers/main.yaml + +- name: Restart Prometheus Node Exporter + systemd: + daemon_reload: true + enabled: true + name: "prometheus-node-exporter" + state: "restarted" + +- name: Restart Prometheus Blackbox Exporter + systemd: + daemon_reload: true + enabled: true + name: "prometheus-blackbox-exporter" + state: "restarted"
\ No newline at end of file diff --git a/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/tasks/main.yaml new file mode 100644 index 0000000000..b38215c4a2 --- /dev/null +++ b/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/tasks/main.yaml @@ -0,0 +1,15 @@ +--- +# file: roles/prometheus_exporter/tasks/main.yaml + +- include_tasks: "{{ ansible_distribution|lower }}_{{ ansible_distribution_release }}.yaml" + tags: + - prometheus-inst + +- name: Conf - Prometheus Blackbox Exporter + copy: + src: 'files/blackbox.yml' + dest: '/etc/prometheus/blackbox.yml' + notify: + - "Restart Prometheus Blackbox Exporter" + tags: + - prometheus-conf-blackbox-exporter
\ No newline at end of file diff --git a/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/tasks/ubuntu_bionic.yaml b/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/tasks/ubuntu_bionic.yaml new file mode 100644 index 0000000000..566753e272 --- /dev/null +++ b/resources/tools/testbed-setup/ansible/roles/prometheus_exporter/tasks/ubuntu_bionic.yaml @@ -0,0 +1,33 @@ +--- +# file: roles/prometheus_exporter/tasks/ubuntu_bionic.yaml + +- name: Inst - Update Package Cache (APT) + apt: + update_cache: yes + cache_valid_time: 3600 + tags: + - prometheus-inst-prerequisites + +- name: Inst - Prerequisites + package: + name: "init-system-helpers" + default_release: "bionic-backports" + state: latest + tags: + - prometheus-inst-prerequisites + +- name: Inst - Prometheus Node Exporter + apt: + deb: "{{ ne_packages }}" + notify: + - "Restart Prometheus Node Exporter" + tags: + - prometheus-inst-node-exporter + +- name: Inst - Prometheus Blackbox Exporter + apt: + deb: "{{ be_packages }}" + notify: + - "Restart Prometheus Blackbox Exporter" + tags: + - prometheus-inst-blackbox-exporter
\ No newline at end of file diff --git a/resources/tools/testbed-setup/ansible/roles/trex/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/trex/tasks/main.yaml index 1e442f5a58..96a3ba5f03 100644 --- a/resources/tools/testbed-setup/ansible/roles/trex/tasks/main.yaml +++ b/resources/tools/testbed-setup/ansible/roles/trex/tasks/main.yaml @@ -1,19 +1,21 @@ --- # file: roles/trex/tasks/main.yaml -- Name: Update package cache (apt) +- name: Inst - Update Package Cache (APT) apt: update_cache: yes cache_valid_time: 3600 when: - ansible_distribution|lower == 'ubuntu' + tags: + - trex-inst-prerequisites -- name: Install Distribution - Release - Machine Prerequisites +- name: Inst - Prerequisites package: name: "{{ packages | flatten(levels=1) }}" state: latest tags: - - install-dependencies + - trex-inst-prerequisites - name: Deploy Multiple T-Rex Versions include_tasks: deploy_block.yaml diff --git a/resources/tools/testbed-setup/ansible/roles/vpp/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/vpp/tasks/main.yaml index 06930b2479..6011917de2 100644 --- a/resources/tools/testbed-setup/ansible/roles/vpp/tasks/main.yaml +++ b/resources/tools/testbed-setup/ansible/roles/vpp/tasks/main.yaml @@ -1,21 +1,23 @@ --- -# file: roles/sut/tasks/main.yaml +# file: roles/vpp/tasks/main.yaml -- Name: Update package cache (apt) +- name: Inst - Update Package Cache (APT) apt: update_cache: yes cache_valid_time: 3600 when: - ansible_distribution|lower == 'ubuntu' + tags: + - vpp-inst-prerequisites -- name: SUT - Install Distribution - Release - Machine Prerequisites +- name: Inst - Prerequisites package: name: "{{ packages | flatten(levels=1) }}" state: latest tags: - - install-dependencies + - vpp-inst-prerequisites -- name: SUT - Install VPP 19.08 PIP requirements +- name: Inst - VPP 19.08 PIP requirements pip: name: - "aenum==2.1.2" @@ -23,7 +25,7 @@ tags: - install-pip -- name: SUT - Copy 80-vpp.conf +- name: Copy 80-vpp.conf file: src: "/dev/null" dest: "/etc/sysctl.d/80-vpp.conf" diff --git a/resources/tools/testbed-setup/ansible/vpp_device.yaml b/resources/tools/testbed-setup/ansible/vpp_device.yaml index ac42b8cafe..7dc3614bbc 100644 --- a/resources/tools/testbed-setup/ansible/vpp_device.yaml +++ b/resources/tools/testbed-setup/ansible/vpp_device.yaml @@ -18,6 +18,10 @@ tags: nomad - role: consul tags: consul + - role: prometheus_exporter + tags: prometheus_exporter + - role: cadvisor + tags: cadvisor - role: vpp_device tags: vpp_device - role: kernel_vm diff --git a/terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl b/terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl index b8a9df4985..fafe623c85 100644 --- a/terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl +++ b/terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl @@ -196,7 +196,7 @@ route: # alerts as-is. This is unlikely to be what you want, unless you have # a very low alert volume or your upstream notification system performs # its own grouping. Example: group_by: [...] - group_by: ['alertname', 'cluster', 'service'] + group_by: ['alertname'] # When a new group of alerts is created by an incoming alert, wait at # least 'group_wait' to send the initial notification. diff --git a/terraform-ci-infra/1n_nmd/exporter/conf/nomad/exporter.hcl b/terraform-ci-infra/1n_nmd/exporter/conf/nomad/exporter.hcl deleted file mode 100644 index 4fd0768ae7..0000000000 --- a/terraform-ci-infra/1n_nmd/exporter/conf/nomad/exporter.hcl +++ /dev/null @@ -1,587 +0,0 @@ -job "${job_name}" { - # The "region" parameter specifies the region in which to execute the job. - # If omitted, this inherits the default region name of "global". - # region = "global" - # - # The "datacenters" parameter specifies the list of datacenters which should - # be considered when placing this task. This must be provided. - datacenters = "${datacenters}" - - # The "type" parameter controls the type of job, which impacts the scheduler's - # decision on placement. This configuration is optional and defaults to - # "service". For a full list of job types and their differences, please see - # the online documentation. - # - # https://www.nomadproject.io/docs/jobspec/schedulers - # - type = "system" - - update { - # The "max_parallel" parameter specifies the maximum number of updates to - # perform in parallel. In this case, this specifies to update a single task - # at a time. - max_parallel = 1 - - health_check = "checks" - - # The "min_healthy_time" parameter specifies the minimum time the allocation - # must be in the healthy state before it is marked as healthy and unblocks - # further allocations from being updated. - min_healthy_time = "10s" - - # The "healthy_deadline" parameter specifies the deadline in which the - # allocation must be marked as healthy after which the allocation is - # automatically transitioned to unhealthy. Transitioning to unhealthy will - # fail the deployment and potentially roll back the job if "auto_revert" is - # set to true. - healthy_deadline = "3m" - - # The "progress_deadline" parameter specifies the deadline in which an - # allocation must be marked as healthy. The deadline begins when the first - # allocation for the deployment is created and is reset whenever an allocation - # as part of the deployment transitions to a healthy state. If no allocation - # transitions to the healthy state before the progress deadline, the - # deployment is marked as failed. - progress_deadline = "10m" - -%{ if use_canary } - # The "canary" parameter specifies that changes to the job that would result - # in destructive updates should create the specified number of canaries - # without stopping any previous allocations. Once the operator determines the - # canaries are healthy, they can be promoted which unblocks a rolling update - # of the remaining allocations at a rate of "max_parallel". - # - # Further, setting "canary" equal to the count of the task group allows - # blue/green deployments. When the job is updated, a full set of the new - # version is deployed and upon promotion the old version is stopped. - canary = 1 - - # Specifies if the job should auto-promote to the canary version when all - # canaries become healthy during a deployment. Defaults to false which means - # canaries must be manually updated with the nomad deployment promote - # command. - auto_promote = true - - # The "auto_revert" parameter specifies if the job should auto-revert to the - # last stable job on deployment failure. A job is marked as stable if all the - # allocations as part of its deployment were marked healthy. - auto_revert = true -%{ endif } - } - - # The "group" stanza defines a series of tasks that should be co-located on - # the same Nomad client. Any task within a group will be placed on the same - # client. - # - # https://www.nomadproject.io/docs/job-specification/group - # - group "prod-group1-exporter-amd64" { - # The constraint allows restricting the set of eligible nodes. Constraints - # may filter on attributes or client metadata. - # - # https://www.nomadproject.io/docs/job-specification/constraint - # - constraint { - attribute = "$${attr.cpu.arch}" - operator = "!=" - value = "arm64" - } - - # The "task" stanza creates an individual unit of work, such as a Docker - # container, web application, or batch processing. - # - # https://www.nomadproject.io/docs/job-specification/task - # - task "prod-task1-${node_service_name}-amd64" { - # The "driver" parameter specifies the task driver that should be used to - # run the task. - driver = "raw_exec" - - # The "config" stanza specifies the driver configuration, which is passed - # directly to the driver to start the task. The details of configurations - # are specific to each driver, so please see specific driver - # documentation for more information. - config { - command = "local/node_exporter-${node_version}.linux-amd64/node_exporter" - } - - # The artifact stanza instructs Nomad to fetch and unpack a remote resource, - # such as a file, tarball, or binary. Nomad downloads artifacts using the - # popular go-getter library, which permits downloading artifacts from a - # variety of locations using a URL as the input source. - # - # https://www.nomadproject.io/docs/job-specification/artifact - # - artifact { - source = "${node_url_amd64}" - } - - # The service stanza instructs Nomad to register a service with Consul. - # - # https://www.nomadproject.io/docs/job-specification/service - # - service { - name = "${node_service_name}" - port = "${node_service_name}" - check { - name = "Node Exporter Check Live" - type = "http" - path = "/metrics" - interval = "10s" - timeout = "2s" - } - } - - # The "resources" stanza describes the requirements a task needs to - # execute. Resource requirements include memory, network, cpu, and more. - # This ensures the task will execute on a machine that contains enough - # resource capacity. - # - # https://www.nomadproject.io/docs/job-specification/resources - # - resources { - cpu = 500 - # The network stanza specifies the networking requirements for the task - # group, including the network mode and port allocations. When scheduling - # jobs in Nomad they are provisioned across your fleet of machines along - # with other jobs and services. Because you don't know in advance what host - # your job will be provisioned on, Nomad will provide your tasks with - # network configuration when they start up. - # - # https://www.nomadproject.io/docs/job-specification/network - # - network { - port "${node_service_name}" { - static = ${node_port} - } - } - } - } - task "prod-task2-${blackbox_service_name}-amd64" { - # The "driver" parameter specifies the task driver that should be used to - # run the task. - driver = "exec" - - # The "config" stanza specifies the driver configuration, which is passed - # directly to the driver to start the task. The details of configurations - # are specific to each driver, so please see specific driver - # documentation for more information. - config { - command = "local/blackbox_exporter-${blackbox_version}.linux-amd64/blackbox_exporter" - args = [ - "--config.file=secrets/blackbox.yml" - ] - } - - # The "template" stanza instructs Nomad to manage a template, such as - # a configuration file or script. This template can optionally pull data - # from Consul or Vault to populate runtime configuration data. - # - # https://www.nomadproject.io/docs/job-specification/template - # - template { - change_mode = "noop" - change_signal = "SIGINT" - destination = "secrets/blackbox.yml" - data = <<EOH -modules: - http_2xx: - prober: http - timeout: 5s - http: - valid_http_versions: ["HTTP/1.1", "HTTP/2.0"] - no_follow_redirects: false - fail_if_ssl: false - fail_if_not_ssl: true - tls_config: - insecure_skip_verify: false - preferred_ip_protocol: "ip4" - icmp_v4: - prober: icmp - timeout: 5s - icmp: - preferred_ip_protocol: "ip4" - dns_udp: - prober: dns - timeout: 5s - dns: - query_name: "jenkins.fd.io" - query_type: "A" - valid_rcodes: - - NOERROR -EOH - } - - # The artifact stanza instructs Nomad to fetch and unpack a remote resource, - # such as a file, tarball, or binary. Nomad downloads artifacts using the - # popular go-getter library, which permits downloading artifacts from a - # variety of locations using a URL as the input source. - # - # https://www.nomadproject.io/docs/job-specification/artifact - # - artifact { - source = "${blackbox_url_amd64}" - } - - # The service stanza instructs Nomad to register a service with Consul. - # - # https://www.nomadproject.io/docs/job-specification/service - # - service { - name = "${blackbox_service_name}" - port = "${blackbox_service_name}" - tags = [ "${blackbox_service_name}$${NOMAD_ALLOC_INDEX}" ] - check { - name = "Blackbox Exporter Check Live" - type = "http" - path = "/metrics" - interval = "10s" - timeout = "2s" - } - } - - # The "resources" stanza describes the requirements a task needs to - # execute. Resource requirements include memory, network, cpu, and more. - # This ensures the task will execute on a machine that contains enough - # resource capacity. - # - # https://www.nomadproject.io/docs/job-specification/resources - # - resources { - cpu = 500 - # The network stanza specifies the networking requirements for the task - # group, including the network mode and port allocations. When scheduling - # jobs in Nomad they are provisioned across your fleet of machines along - # with other jobs and services. Because you don't know in advance what host - # your job will be provisioned on, Nomad will provide your tasks with - # network configuration when they start up. - # - # https://www.nomadproject.io/docs/job-specification/network - # - network { - port "${blackbox_service_name}" { - static = ${blackbox_port} - } - } - } - } - - task "prod-task3-${cadvisor_service_name}-amd64" { - # The "driver" parameter specifies the task driver that should be used to - # run the task. - driver = "docker" - - # The "config" stanza specifies the driver configuration, which is passed - # directly to the driver to start the task. The details of configurations - # are specific to each driver, so please see specific driver - # documentation for more information. - config { - image = "${cadvisor_image}" - volumes = [ - "/:/rootfs:ro", - "/var/run:/var/run:rw", - "/sys:/sys:ro", - "/var/lib/docker/:/var/lib/docker:ro", - "/cgroup:/cgroup:ro" - ] - } - - # The service stanza instructs Nomad to register a service with Consul. - # - # https://www.nomadproject.io/docs/job-specification/service - # - service { - name = "${cadvisor_service_name}" - port = "${cadvisor_service_name}" - check { - name = "cAdvisor Check Live" - type = "http" - path = "/metrics" - interval = "10s" - timeout = "2s" - } - } - - # The "resources" stanza describes the requirements a task needs to - # execute. Resource requirements include memory, network, cpu, and more. - # This ensures the task will execute on a machine that contains enough - # resource capacity. - # - # https://www.nomadproject.io/docs/job-specification/resources - # - resources { - cpu = 500 - # The network stanza specifies the networking requirements for the task - # group, including the network mode and port allocations. When scheduling - # jobs in Nomad they are provisioned across your fleet of machines along - # with other jobs and services. Because you don't know in advance what host - # your job will be provisioned on, Nomad will provide your tasks with - # network configuration when they start up. - # - # https://www.nomadproject.io/docs/job-specification/network - # - network { - port "${cadvisor_service_name}" { - static = ${cadvisor_port} - } - } - } - } - } - - group "prod-group1-exporter-arm64" { - # The constraint allows restricting the set of eligible nodes. Constraints - # may filter on attributes or client metadata. - # - # https://www.nomadproject.io/docs/job-specification/constraint - # - constraint { - attribute = "$${attr.cpu.arch}" - operator = "==" - value = "arm64" - } - - # The "task" stanza creates an individual unit of work, such as a Docker - # container, web application, or batch processing. - # - # https://www.nomadproject.io/docs/job-specification/task - # - task "prod-task1-${node_service_name}-arm64" { - # The "driver" parameter specifies the task driver that should be used to - # run the task. - driver = "raw_exec" - - # The "config" stanza specifies the driver configuration, which is passed - # directly to the driver to start the task. The details of configurations - # are specific to each driver, so please see specific driver - # documentation for more information. - config { - command = "local/node_exporter-${node_version}.linux-arm64/node_exporter" - } - - # The artifact stanza instructs Nomad to fetch and unpack a remote resource, - # such as a file, tarball, or binary. Nomad downloads artifacts using the - # popular go-getter library, which permits downloading artifacts from a - # variety of locations using a URL as the input source. - # - # https://www.nomadproject.io/docs/job-specification/artifact - # - artifact { - source = "${node_url_arm64}" - } - - # The service stanza instructs Nomad to register a service with Consul. - # - # https://www.nomadproject.io/docs/job-specification/service - # - service { - name = "${node_service_name}" - port = "${node_service_name}" - check { - name = "Node Exporter Check Live" - type = "http" - path = "/metrics" - interval = "10s" - timeout = "2s" - } - } - - # The "resources" stanza describes the requirements a task needs to - # execute. Resource requirements include memory, network, cpu, and more. - # This ensures the task will execute on a machine that contains enough - # resource capacity. - # - # https://www.nomadproject.io/docs/job-specification/resources - # - resources { - cpu = 500 - # The network stanza specifies the networking requirements for the task - # group, including the network mode and port allocations. When scheduling - # jobs in Nomad they are provisioned across your fleet of machines along - # with other jobs and services. Because you don't know in advance what host - # your job will be provisioned on, Nomad will provide your tasks with - # network configuration when they start up. - # - # https://www.nomadproject.io/docs/job-specification/network - # - network { - port "${node_service_name}" { - static = ${node_port} - } - } - } - } - - task "prod-task2-${blackbox_service_name}-arm64" { - # The "driver" parameter specifies the task driver that should be used to - # run the task. - driver = "exec" - - # The "config" stanza specifies the driver configuration, which is passed - # directly to the driver to start the task. The details of configurations - # are specific to each driver, so please see specific driver - # documentation for more information. - config { - command = "local/blackbox_exporter-${blackbox_version}.linux-arm64/blackbox_exporter" - args = [ - "--config.file=secrets/blackbox.yml" - ] - } - - # The "template" stanza instructs Nomad to manage a template, such as - # a configuration file or script. This template can optionally pull data - # from Consul or Vault to populate runtime configuration data. - # - # https://www.nomadproject.io/docs/job-specification/template - # - template { - change_mode = "noop" - change_signal = "SIGINT" - destination = "secrets/blackbox.yml" - data = <<EOH -modules: - http_2xx: - prober: http - timeout: 5s - http: - valid_http_versions: ["HTTP/1.1", "HTTP/2.0"] - no_follow_redirects: false - fail_if_ssl: false - fail_if_not_ssl: true - tls_config: - insecure_skip_verify: false - preferred_ip_protocol: "ip4" - icmp_v4: - prober: icmp - timeout: 5s - icmp: - preferred_ip_protocol: "ip4" - dns_udp: - prober: dns - timeout: 5s - dns: - query_name: "jenkins.fd.io" - query_type: "A" - valid_rcodes: - - NOERROR -EOH - } - - # The artifact stanza instructs Nomad to fetch and unpack a remote resource, - # such as a file, tarball, or binary. Nomad downloads artifacts using the - # popular go-getter library, which permits downloading artifacts from a - # variety of locations using a URL as the input source. - # - # https://www.nomadproject.io/docs/job-specification/artifact - # - artifact { - source = "${blackbox_url_arm64}" - } - - # The service stanza instructs Nomad to register a service with Consul. - # - # https://www.nomadproject.io/docs/job-specification/service - # - service { - name = "${blackbox_service_name}" - port = "${blackbox_service_name}" - tags = [ "${blackbox_service_name}$${NOMAD_ALLOC_INDEX}" ] - check { - name = "Blackbox Exporter Check Live" - type = "http" - path = "/metrics" - interval = "10s" - timeout = "2s" - } - } - - # The "resources" stanza describes the requirements a task needs to - # execute. Resource requirements include memory, network, cpu, and more. - # This ensures the task will execute on a machine that contains enough - # resource capacity. - # - # https://www.nomadproject.io/docs/job-specification/resources - # - resources { - cpu = 500 - # The network stanza specifies the networking requirements for the task - # group, including the network mode and port allocations. When scheduling - # jobs in Nomad they are provisioned across your fleet of machines along - # with other jobs and services. Because you don't know in advance what host - # your job will be provisioned on, Nomad will provide your tasks with - # network configuration when they start up. - # - # https://www.nomadproject.io/docs/job-specification/network - # - network { - port "${blackbox_service_name}" { - static = ${blackbox_port} - } - } - } - } - - task "prod-task3-${cadvisor_service_name}-arm64" { - # The "driver" parameter specifies the task driver that should be used to - # run the task. - driver = "docker" - - # The "config" stanza specifies the driver configuration, which is passed - # directly to the driver to start the task. The details of configurations - # are specific to each driver, so please see specific driver - # documentation for more information. - config { - # There is currently no official release for arm yet...using community. - image = "zcube/cadvisor:latest" - volumes = [ - "/:/rootfs:ro", - "/var/run:/var/run:rw", - "/sys:/sys:ro", - "/var/lib/docker/:/var/lib/docker:ro", - "/cgroup:/cgroup:ro" - ] - } - - # The service stanza instructs Nomad to register a service with Consul. - # - # https://www.nomadproject.io/docs/job-specification/service - # - service { - name = "${cadvisor_service_name}" - port = "${cadvisor_service_name}" - check { - name = "cAdvisor Check Live" - type = "http" - path = "/metrics" - interval = "10s" - timeout = "2s" - } - } - - # The "resources" stanza describes the requirements a task needs to - # execute. Resource requirements include memory, network, cpu, and more. - # This ensures the task will execute on a machine that contains enough - # resource capacity. - # - # https://www.nomadproject.io/docs/job-specification/resources - # - resources { - cpu = 500 - # The network stanza specifies the networking requirements for the task - # group, including the network mode and port allocations. When scheduling - # jobs in Nomad they are provisioned across your fleet of machines along - # with other jobs and services. Because you don't know in advance what host - # your job will be provisioned on, Nomad will provide your tasks with - # network configuration when they start up. - # - # https://www.nomadproject.io/docs/job-specification/network - # - network { - port "${cadvisor_service_name}" { - static = ${cadvisor_port} - } - } - } - } - } -}
\ No newline at end of file diff --git a/terraform-ci-infra/1n_nmd/exporter/main.tf b/terraform-ci-infra/1n_nmd/exporter/main.tf deleted file mode 100644 index 35eb95b071..0000000000 --- a/terraform-ci-infra/1n_nmd/exporter/main.tf +++ /dev/null @@ -1,64 +0,0 @@ -locals { - datacenters = join(",", var.nomad_datacenters) - - node_url_amd64 = join("", - [ - "https://github.com", - "/prometheus/node_exporter/releases/download/", - "v${var.node_version}/", - "node_exporter-${var.node_version}.linux-amd64.tar.gz" - ] - ) - node_url_arm64 = join("", - [ - "https://github.com", - "/prometheus/node_exporter/releases/download/", - "v${var.node_version}/", - "node_exporter-${var.node_version}.linux-arm64.tar.gz" - ] - ) - - blackbox_url_amd64 = join("", - [ - "https://github.com", - "/prometheus/blackbox_exporter/releases/download/", - "v${var.blackbox_version}/", - "blackbox_exporter-${var.blackbox_version}.linux-amd64.tar.gz" - ] - ) - blackbox_url_arm64 = join("", - [ - "https://github.com", - "/prometheus/blackbox_exporter/releases/download/", - "v${var.blackbox_version}/", - "blackbox_exporter-${var.blackbox_version}.linux-arm64.tar.gz" - ] - ) -} - -data "template_file" "nomad_job_exporter" { - template = file("${path.module}/conf/nomad/exporter.hcl") - vars = { - datacenters = local.datacenters - job_name = var.exporter_job_name - use_canary = var.exporter_use_canary - node_url_amd64 = local.node_url_amd64 - node_url_arm64 = local.node_url_arm64 - node_version = var.node_version - node_service_name = var.node_service_name - node_port = var.node_port - blackbox_url_amd64 = local.blackbox_url_amd64 - blackbox_url_arm64 = local.blackbox_url_arm64 - blackbox_version = var.blackbox_version - blackbox_service_name = var.blackbox_service_name - blackbox_port = var.blackbox_port - cadvisor_image = var.cadvisor_image - cadvisor_service_name = var.cadvisor_service_name - cadvisor_port = var.cadvisor_port - } -} - -resource "nomad_job" "nomad_job_exporter" { - jobspec = data.template_file.nomad_job_exporter.rendered - detach = false -}
\ No newline at end of file diff --git a/terraform-ci-infra/1n_nmd/exporter/variables.tf b/terraform-ci-infra/1n_nmd/exporter/variables.tf deleted file mode 100644 index bfa8bd37ac..0000000000 --- a/terraform-ci-infra/1n_nmd/exporter/variables.tf +++ /dev/null @@ -1,76 +0,0 @@ -# Nomad -variable "nomad_datacenters" { - description = "Nomad data centers" - type = list(string) - default = [ "dc1" ] -} - -# Exporter -variable "exporter_job_name" { - description = "Exporter job name" - type = string - default = "exporter" -} - -variable "exporter_use_canary" { - description = "Uses canary deployment" - type = bool - default = false -} - -# Node Exporter -variable "node_service_name" { - description = "Node exporter service name" - type = string - default = "nodeexporter" -} - -variable "node_version" { - description = "Node exporter version" - type = string - default = "1.0.1" -} - -variable "node_port" { - description = "Node exporter TCP allocation" - type = number - default = 9100 -} - -# Blackbox Exporter -variable "blackbox_service_name" { - description = "Blackbox exporter service name" - type = string - default = "blackboxexporter" -} - -variable "blackbox_version" { - description = "Blackbox exporter version" - type = string - default = "0.18.0" -} - -variable "blackbox_port" { - description = "Blackbox exporter TCP allocation" - type = number - default = 9115 -} - -# cAdvisor Exporter -variable "cadvisor_service_name" { - description = "cAdvisor exporter service name" - type = string - default = "cadvisorexporter" -} - -variable "cadvisor_image" { - description = "cAdvisor exporter docker image" - type = string - default = "gcr.io/cadvisor/cadvisor:v0.38.7" -} - -variable "cadvisor_port" { - description = "cAdvisor exporter TCP allocation" - type = number - default = 8080 -}
\ No newline at end of file diff --git a/terraform-ci-infra/1n_nmd/main.tf b/terraform-ci-infra/1n_nmd/main.tf index fdf3801a35..985f9ae7c6 100644 --- a/terraform-ci-infra/1n_nmd/main.tf +++ b/terraform-ci-infra/1n_nmd/main.tf @@ -32,32 +32,6 @@ module "alertmanager" { alertmanager_slack_channel = "fdio-infra-monitoring" } -module "exporter" { - source = "./exporter" - providers = { - nomad = nomad.yul1 - } - - # nomad - nomad_datacenters = [ "yul1" ] - - # exporter - exporter_job_name = "prod-exporter" - exporter_use_canary = false - - # node - node_version = "1.0.1" - node_port = 9100 - - # blackbox - blackbox_version = "0.18.0" - blackbox_port = 9115 - - # cadvisor - cadvisor_image = "gcr.io/cadvisor/cadvisor:latest" - cadvisor_port = 8080 -} - module "grafana" { source = "./grafana" providers = { diff --git a/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl b/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl index 2d74662f31..4918a5f5bd 100644 --- a/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl +++ b/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl @@ -225,13 +225,13 @@ groups: summary: "Prometheus target missing (instance {{ $labels.instance }})." description: "A Prometheus target has disappeared. An exporter might be crashed." - alert: HostHighCpuLoad - expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80 + expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95 for: 0m labels: severity: warning annotations: summary: "Host high CPU load (instance {{ $labels.instance }})." - description: "CPU load is > 80%." + description: "CPU load is > 95%." - alert: HostOutOfMemory expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10 for: 2m @@ -443,16 +443,31 @@ scrape_configs: - job_name: 'Consul Cluster' static_configs: - - targets: [ '10.30.51.30:8500', '10.30.51.32:8500', '10.30.51.33:8500' ] + - targets: [ '10.30.51.28:8500' ] + - targets: [ '10.30.51.29:8500' ] + - targets: [ '10.30.51.30:8500' ] + - targets: [ '10.30.51.32:8500' ] + - targets: [ '10.30.51.33:8500' ] + - targets: [ '10.30.51.34:8500' ] + - targets: [ '10.30.51.35:8500' ] + - targets: [ '10.30.51.39:8500' ] + - targets: [ '10.30.51.40:8500' ] + - targets: [ '10.30.51.50:8500' ] + - targets: [ '10.30.51.51:8500' ] + - targets: [ '10.30.51.65:8500' ] + - targets: [ '10.30.51.66:8500' ] + - targets: [ '10.30.51.67:8500' ] + - targets: [ '10.30.51.68:8500' ] + - targets: [ '10.30.51.70:8500' ] + - targets: [ '10.30.51.71:8500' ] + - targets: [ '10.32.8.14:8500' ] + - targets: [ '10.32.8.15:8500' ] + - targets: [ '10.32.8.16:8500' ] + - targets: [ '10.32.8.17:8500' ] metrics_path: /v1/agent/metrics params: format: [ 'prometheus' ] - - job_name: 'Alertmanager' - consul_sd_configs: - - server: '{{ env "NOMAD_IP_prometheus" }}:8500' - services: [ 'alertmanager' ] - - job_name: 'Blackbox Exporter (icmp)' static_configs: - targets: [ 'gerrit.fd.io' ] @@ -485,20 +500,63 @@ scrape_configs: metrics_path: /probe - job_name: 'cAdvisor Exporter' + static_configs: + - targets: [ '10.30.51.28:8080' ] + - targets: [ '10.30.51.29:8080' ] + - targets: [ '10.30.51.30:8080' ] + #- targets: [ '10.30.51.32:8080' ] + - targets: [ '10.30.51.33:8080' ] + - targets: [ '10.30.51.34:8080' ] + - targets: [ '10.30.51.35:8080' ] + - targets: [ '10.30.51.39:8080' ] + - targets: [ '10.30.51.40:8080' ] + - targets: [ '10.30.51.50:8080' ] + - targets: [ '10.30.51.51:8080' ] + - targets: [ '10.30.51.65:8080' ] + - targets: [ '10.30.51.66:8080' ] + - targets: [ '10.30.51.67:8080' ] + - targets: [ '10.30.51.68:8080' ] + - targets: [ '10.30.51.70:8080' ] + - targets: [ '10.30.51.71:8080' ] + - targets: [ '10.32.8.14:8080' ] + - targets: [ '10.32.8.15:8080' ] + - targets: [ '10.32.8.16:8080' ] + - targets: [ '10.32.8.17:8080' ] + + - job_name: 'Node Exporter' + static_configs: + - targets: [ '10.30.51.28:9100' ] + - targets: [ '10.30.51.29:9100' ] + - targets: [ '10.30.51.30:9100' ] + - targets: [ '10.30.51.32:9100' ] + - targets: [ '10.30.51.33:9100' ] + - targets: [ '10.30.51.34:9100' ] + - targets: [ '10.30.51.35:9100' ] + - targets: [ '10.30.51.39:9100' ] + - targets: [ '10.30.51.40:9100' ] + - targets: [ '10.30.51.50:9100' ] + - targets: [ '10.30.51.51:9100' ] + - targets: [ '10.30.51.65:9100' ] + - targets: [ '10.30.51.66:9100' ] + - targets: [ '10.30.51.67:9100' ] + - targets: [ '10.30.51.68:9100' ] + - targets: [ '10.30.51.70:9100' ] + - targets: [ '10.30.51.71:9100' ] + - targets: [ '10.32.8.14:9100' ] + - targets: [ '10.32.8.15:9100' ] + - targets: [ '10.32.8.16:9100' ] + - targets: [ '10.32.8.17:9100' ] + + - job_name: 'Alertmanager' consul_sd_configs: - server: '{{ env "NOMAD_IP_prometheus" }}:8500' - services: [ 'cadvisorexporter' ] + services: [ 'alertmanager' ] - job_name: 'Grafana' consul_sd_configs: - server: '{{ env "NOMAD_IP_prometheus" }}:8500' services: [ 'grafana' ] - - job_name: 'Node Exporter' - consul_sd_configs: - - server: '{{ env "NOMAD_IP_prometheus" }}:8500' - services: [ 'nodeexporter' ] - - job_name: 'Prometheus' consul_sd_configs: - server: '{{ env "NOMAD_IP_prometheus" }}:8500' diff --git a/terraform-ci-infra/1n_nmd/terraform.tfstate b/terraform-ci-infra/1n_nmd/terraform.tfstate index 03f0f0a075..d60f10206f 100644 --- a/terraform-ci-infra/1n_nmd/terraform.tfstate +++ b/terraform-ci-infra/1n_nmd/terraform.tfstate @@ -1,7 +1,7 @@ { "version": 4, "terraform_version": "0.14.5", - "serial": 1034, + "serial": 1042, "lineage": "e4e7f30a-652d-7a31-e31c-5e3a3388c9b9", "outputs": {}, "resources": [ @@ -16,9 +16,9 @@ "schema_version": 0, "attributes": { "filename": null, - "id": "ad663ce4420bb5e99efc3b23d4038bf0538eeed5e73135d34e8104aaba94fce4", - "rendered": "job \"prod-alertmanager\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-alertmanager\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-alertmanager\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\nglobal:\n # The API URL to use for Slack notifications.\n slack_api_url: 'https://hooks.slack.com/services/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'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: 'default-receiver'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname', 'cluster', 'service']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n # This routes performs a regular expression match on alert labels to\n # catch alerts that are related to a list of services.\n - match_re:\n service: .*\n receiver: default-receiver\n # The service has a sub-route for critical alerts, any alerts\n # that do not match, i.e. severity != critical, fall-back to the\n # parent node and are sent to 'team-X-mails'\n routes:\n - match:\n severity: critical\n receiver: 'default-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n # Apply inhibition if the alertname is the same.\n # CAUTION:\n # If all label names listed in `equal` are missing\n # from both the source and target alerts,\n # the inhibition rule will apply!\n equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: 'default-receiver'\n slack_configs:\n - channel: '#fdio-infra-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"alertmanager\"\n port = \"alertmanager\"\n tags = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 1000\n memory = 1024\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"alertmanager\" {\n static = 9093\n }\n }\n }\n }\n }\n}", - "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-${version}.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${url}\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\nglobal:\n # The API URL to use for Slack notifications.\n slack_api_url: 'https://hooks.slack.com/services/${slack_api_key}'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: '${default_receiver}'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname', 'cluster', 'service']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n # This routes performs a regular expression match on alert labels to\n # catch alerts that are related to a list of services.\n - match_re:\n service: .*\n receiver: ${default_receiver}\n # The service has a sub-route for critical alerts, any alerts\n # that do not match, i.e. severity != critical, fall-back to the\n # parent node and are sent to 'team-X-mails'\n routes:\n - match:\n severity: critical\n receiver: '${default_receiver}'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n # Apply inhibition if the alertname is the same.\n # CAUTION:\n # If all label names listed in `equal` are missing\n # from both the source and target alerts,\n # the inhibition rule will apply!\n equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: '${default_receiver}'\n slack_configs:\n - channel: '#${slack_channel}'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}", + "id": "1ea874db11980359d80aada65912759b6acd9a7399f011d823a72eb7c5b4a329", + "rendered": "job \"prod-alertmanager\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-alertmanager\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-alertmanager\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\nglobal:\n # The API URL to use for Slack notifications.\n slack_api_url: 'https://hooks.slack.com/services/XX'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: 'default-receiver'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n # This routes performs a regular expression match on alert labels to\n # catch alerts that are related to a list of services.\n - match_re:\n service: .*\n receiver: default-receiver\n # The service has a sub-route for critical alerts, any alerts\n # that do not match, i.e. severity != critical, fall-back to the\n # parent node and are sent to 'team-X-mails'\n routes:\n - match:\n severity: critical\n receiver: 'default-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n # Apply inhibition if the alertname is the same.\n # CAUTION:\n # If all label names listed in `equal` are missing\n # from both the source and target alerts,\n # the inhibition rule will apply!\n equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: 'default-receiver'\n slack_configs:\n - channel: '#fdio-infra-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"alertmanager\"\n port = \"alertmanager\"\n tags = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 1000\n memory = 1024\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"alertmanager\" {\n static = 9093\n }\n }\n }\n }\n }\n}", + "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-${version}.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${url}\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\nglobal:\n # The API URL to use for Slack notifications.\n slack_api_url: 'https://hooks.slack.com/services/${slack_api_key}'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: '${default_receiver}'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n # This routes performs a regular expression match on alert labels to\n # catch alerts that are related to a list of services.\n - match_re:\n service: .*\n receiver: ${default_receiver}\n # The service has a sub-route for critical alerts, any alerts\n # that do not match, i.e. severity != critical, fall-back to the\n # parent node and are sent to 'team-X-mails'\n routes:\n - match:\n severity: critical\n receiver: '${default_receiver}'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n # Apply inhibition if the alertname is the same.\n # CAUTION:\n # If all label names listed in `equal` are missing\n # from both the source and target alerts,\n # the inhibition rule will apply!\n equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: '${default_receiver}'\n slack_configs:\n - channel: '#${slack_channel}'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}", "vars": { "cpu": "1000", "datacenters": "yul1", @@ -28,7 +28,7 @@ "mem": "1024", "port": "9093", "service_name": "alertmanager", - "slack_api_key": "TE07RD1V1/B01L7PQK9S8/xncEcMAvF0GtJpTbC30E0AyL", + "slack_api_key": "XX", "slack_channel": "fdio-infra-monitoring", "url": "https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz", "use_canary": "true", @@ -51,21 +51,21 @@ "schema_version": 0, "attributes": { "allocation_ids": [ - "a0e6d97f-c9fa-dba3-a58a-24540493931e", - "464144d9-8c27-dc03-71bb-6f8d525bc486" + "e0aa0e03-a425-2ec8-06ad-ccf89f0f4ee3", + "bf013613-1ac7-6155-69e0-51ff57719549" ], "datacenters": [ "yul1" ], - "deployment_id": "b4e00636-7610-9e21-7a46-0dc211c21abd", + "deployment_id": "d132f4bd-ef31-5406-97e6-8831ad4a3db5", "deployment_status": "successful", "deregister_on_destroy": true, "deregister_on_id_change": true, "detach": false, "id": "prod-alertmanager", - "jobspec": "job \"prod-alertmanager\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-alertmanager\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-alertmanager\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\nglobal:\n # The API URL to use for Slack notifications.\n slack_api_url: 'https://hooks.slack.com/services/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'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: 'default-receiver'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname', 'cluster', 'service']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n # This routes performs a regular expression match on alert labels to\n # catch alerts that are related to a list of services.\n - match_re:\n service: .*\n receiver: default-receiver\n # The service has a sub-route for critical alerts, any alerts\n # that do not match, i.e. severity != critical, fall-back to the\n # parent node and are sent to 'team-X-mails'\n routes:\n - match:\n severity: critical\n receiver: 'default-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n # Apply inhibition if the alertname is the same.\n # CAUTION:\n # If all label names listed in `equal` are missing\n # from both the source and target alerts,\n # the inhibition rule will apply!\n equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: 'default-receiver'\n slack_configs:\n - channel: '#fdio-infra-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"alertmanager\"\n port = \"alertmanager\"\n tags = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 1000\n memory = 1024\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"alertmanager\" {\n static = 9093\n }\n }\n }\n }\n }\n}", + "jobspec": "job \"prod-alertmanager\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-alertmanager\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-alertmanager\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\nglobal:\n # The API URL to use for Slack notifications.\n slack_api_url: 'https://hooks.slack.com/services/XX'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: 'default-receiver'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n # This routes performs a regular expression match on alert labels to\n # catch alerts that are related to a list of services.\n - match_re:\n service: .*\n receiver: default-receiver\n # The service has a sub-route for critical alerts, any alerts\n # that do not match, i.e. severity != critical, fall-back to the\n # parent node and are sent to 'team-X-mails'\n routes:\n - match:\n severity: critical\n receiver: 'default-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n # Apply inhibition if the alertname is the same.\n # CAUTION:\n # If all label names listed in `equal` are missing\n # from both the source and target alerts,\n # the inhibition rule will apply!\n equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: 'default-receiver'\n slack_configs:\n - channel: '#fdio-infra-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"alertmanager\"\n port = \"alertmanager\"\n tags = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 1000\n memory = 1024\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"alertmanager\" {\n static = 9093\n }\n }\n }\n }\n }\n}", "json": null, - "modify_index": "7148912", + "modify_index": "7202773", "name": "prod-alertmanager", "namespace": "default", "policy_override": null, @@ -98,156 +98,6 @@ ] }, { - "module": "module.exporter", - "mode": "data", - "type": "template_file", - "name": "nomad_job_exporter", - "provider": "provider[\"registry.terraform.io/hashicorp/template\"]", - "instances": [ - { - "schema_version": 0, - "attributes": { - "filename": null, - "id": "28db119a83617686c94496ed114455041d4fc500d93c4de8f8b8cfbe7d4c1db2", - "rendered": "job \"prod-exporter\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"system\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-exporter-amd64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-nodeexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"raw_exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/node_exporter-1.0.1.linux-amd64/node_exporter\"\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-amd64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"nodeexporter\"\n port = \"nodeexporter\"\n check {\n name = \"Node Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"nodeexporter\" {\n static = 9100\n }\n }\n }\n }\n task \"prod-task2-blackboxexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/blackbox_exporter-0.18.0.linux-amd64/blackbox_exporter\"\n args = [\n \"--config.file=secrets/blackbox.yml\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/blackbox.yml\"\n data = \u003c\u003cEOH\nmodules:\n http_2xx:\n prober: http\n timeout: 5s\n http:\n valid_http_versions: [\"HTTP/1.1\", \"HTTP/2.0\"]\n no_follow_redirects: false\n fail_if_ssl: false\n fail_if_not_ssl: true\n tls_config:\n insecure_skip_verify: false\n preferred_ip_protocol: \"ip4\"\n icmp_v4:\n prober: icmp\n timeout: 5s\n icmp:\n preferred_ip_protocol: \"ip4\"\n dns_udp:\n prober: dns\n timeout: 5s\n dns:\n query_name: \"jenkins.fd.io\"\n query_type: \"A\"\n valid_rcodes:\n - NOERROR\nEOH\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-amd64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"blackboxexporter\"\n port = \"blackboxexporter\"\n tags = [ \"blackboxexporter${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Blackbox Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"blackboxexporter\" {\n static = 9115\n }\n }\n }\n }\n\n task \"prod-task3-cadvisorexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"gcr.io/cadvisor/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"cadvisorexporter\"\n port = \"cadvisorexporter\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"cadvisorexporter\" {\n static = 8080\n }\n }\n }\n }\n }\n\n group \"prod-group1-exporter-arm64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"==\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-nodeexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"raw_exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/node_exporter-1.0.1.linux-arm64/node_exporter\"\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-arm64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"nodeexporter\"\n port = \"nodeexporter\"\n check {\n name = \"Node Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"nodeexporter\" {\n static = 9100\n }\n }\n }\n }\n\n task \"prod-task2-blackboxexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/blackbox_exporter-0.18.0.linux-arm64/blackbox_exporter\"\n args = [\n \"--config.file=secrets/blackbox.yml\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/blackbox.yml\"\n data = \u003c\u003cEOH\nmodules:\n http_2xx:\n prober: http\n timeout: 5s\n http:\n valid_http_versions: [\"HTTP/1.1\", \"HTTP/2.0\"]\n no_follow_redirects: false\n fail_if_ssl: false\n fail_if_not_ssl: true\n tls_config:\n insecure_skip_verify: false\n preferred_ip_protocol: \"ip4\"\n icmp_v4:\n prober: icmp\n timeout: 5s\n icmp:\n preferred_ip_protocol: \"ip4\"\n dns_udp:\n prober: dns\n timeout: 5s\n dns:\n query_name: \"jenkins.fd.io\"\n query_type: \"A\"\n valid_rcodes:\n - NOERROR\nEOH\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-arm64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"blackboxexporter\"\n port = \"blackboxexporter\"\n tags = [ \"blackboxexporter${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Blackbox Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"blackboxexporter\" {\n static = 9115\n }\n }\n }\n }\n\n task \"prod-task3-cadvisorexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n # There is currently no official release for arm yet...using community.\n image = \"zcube/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"cadvisorexporter\"\n port = \"cadvisorexporter\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"cadvisorexporter\" {\n static = 8080\n }\n }\n }\n }\n }\n}", - "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"system\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-exporter-amd64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${node_service_name}-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"raw_exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/node_exporter-${node_version}.linux-amd64/node_exporter\"\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${node_url_amd64}\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${node_service_name}\"\n port = \"${node_service_name}\"\n check {\n name = \"Node Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${node_service_name}\" {\n static = ${node_port}\n }\n }\n }\n }\n task \"prod-task2-${blackbox_service_name}-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/blackbox_exporter-${blackbox_version}.linux-amd64/blackbox_exporter\"\n args = [\n \"--config.file=secrets/blackbox.yml\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/blackbox.yml\"\n data = \u003c\u003cEOH\nmodules:\n http_2xx:\n prober: http\n timeout: 5s\n http:\n valid_http_versions: [\"HTTP/1.1\", \"HTTP/2.0\"]\n no_follow_redirects: false\n fail_if_ssl: false\n fail_if_not_ssl: true\n tls_config:\n insecure_skip_verify: false\n preferred_ip_protocol: \"ip4\"\n icmp_v4:\n prober: icmp\n timeout: 5s\n icmp:\n preferred_ip_protocol: \"ip4\"\n dns_udp:\n prober: dns\n timeout: 5s\n dns:\n query_name: \"jenkins.fd.io\"\n query_type: \"A\"\n valid_rcodes:\n - NOERROR\nEOH\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${blackbox_url_amd64}\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${blackbox_service_name}\"\n port = \"${blackbox_service_name}\"\n tags = [ \"${blackbox_service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Blackbox Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${blackbox_service_name}\" {\n static = ${blackbox_port}\n }\n }\n }\n }\n\n task \"prod-task3-${cadvisor_service_name}-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"${cadvisor_image}\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${cadvisor_service_name}\"\n port = \"${cadvisor_service_name}\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${cadvisor_service_name}\" {\n static = ${cadvisor_port}\n }\n }\n }\n }\n }\n\n group \"prod-group1-exporter-arm64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"==\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${node_service_name}-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"raw_exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/node_exporter-${node_version}.linux-arm64/node_exporter\"\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${node_url_arm64}\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${node_service_name}\"\n port = \"${node_service_name}\"\n check {\n name = \"Node Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${node_service_name}\" {\n static = ${node_port}\n }\n }\n }\n }\n\n task \"prod-task2-${blackbox_service_name}-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/blackbox_exporter-${blackbox_version}.linux-arm64/blackbox_exporter\"\n args = [\n \"--config.file=secrets/blackbox.yml\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/blackbox.yml\"\n data = \u003c\u003cEOH\nmodules:\n http_2xx:\n prober: http\n timeout: 5s\n http:\n valid_http_versions: [\"HTTP/1.1\", \"HTTP/2.0\"]\n no_follow_redirects: false\n fail_if_ssl: false\n fail_if_not_ssl: true\n tls_config:\n insecure_skip_verify: false\n preferred_ip_protocol: \"ip4\"\n icmp_v4:\n prober: icmp\n timeout: 5s\n icmp:\n preferred_ip_protocol: \"ip4\"\n dns_udp:\n prober: dns\n timeout: 5s\n dns:\n query_name: \"jenkins.fd.io\"\n query_type: \"A\"\n valid_rcodes:\n - NOERROR\nEOH\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${blackbox_url_arm64}\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${blackbox_service_name}\"\n port = \"${blackbox_service_name}\"\n tags = [ \"${blackbox_service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Blackbox Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${blackbox_service_name}\" {\n static = ${blackbox_port}\n }\n }\n }\n }\n\n task \"prod-task3-${cadvisor_service_name}-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n # There is currently no official release for arm yet...using community.\n image = \"zcube/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${cadvisor_service_name}\"\n port = \"${cadvisor_service_name}\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${cadvisor_service_name}\" {\n static = ${cadvisor_port}\n }\n }\n }\n }\n }\n}", - "vars": { - "blackbox_port": "9115", - "blackbox_service_name": "blackboxexporter", - "blackbox_url_amd64": "https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-amd64.tar.gz", - "blackbox_url_arm64": "https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-arm64.tar.gz", - "blackbox_version": "0.18.0", - "cadvisor_image": "gcr.io/cadvisor/cadvisor:latest", - "cadvisor_port": "8080", - "cadvisor_service_name": "cadvisorexporter", - "datacenters": "yul1", - "job_name": "prod-exporter", - "node_port": "9100", - "node_service_name": "nodeexporter", - "node_url_amd64": "https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-amd64.tar.gz", - "node_url_arm64": "https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-arm64.tar.gz", - "node_version": "1.0.1", - "use_canary": "false" - } - }, - "sensitive_attributes": [] - } - ] - }, - { - "module": "module.exporter", - "mode": "managed", - "type": "nomad_job", - "name": "nomad_job_exporter", - "provider": "provider[\"registry.terraform.io/hashicorp/nomad\"].yul1", - "instances": [ - { - "schema_version": 0, - "attributes": { - "allocation_ids": [ - "e11d14cf-0b12-7f11-f982-e75abb2fa245", - "be1d7792-8b38-0769-1899-4d78ac17d775", - "33190f8d-6b97-5a79-32c3-c2a48eb33cab", - "7b9a89f3-3094-73aa-de9e-b674cdf3962b", - "1b730294-e8c6-d330-72a6-7502ef6d8195", - "3b3932ce-d57e-d0c6-0d36-b32724d83a0f", - "d07855d6-29a8-dba5-92a8-bed5673c3b81", - "4ea7fa69-4819-5050-37ea-bbce995e994d", - "2c694f1a-fb0e-67b9-407a-dd5f1d72daa0", - "52e828d4-79a7-9e43-2cdb-1ba6088f3257", - "e23fc3eb-1699-e1e8-629b-82a21b72ecf4", - "6f83b8c0-4313-eb5a-9c22-e8db2226e832", - "15ee749c-07f0-2864-f907-919faa62cee2", - "12025e7c-4a27-9a9b-b0aa-a205179139eb", - "a6234b96-8c7b-1fd4-e85f-d361774d3005", - "fadf8b49-0e1c-98ea-42b5-119667fa092e", - "2cb8905c-3d36-878d-917e-493f8a982970", - "a1426c7b-a4a1-0f19-cc11-162b396e9ce9", - "f24eee7b-e480-3386-9a6e-357edb9ae137", - "7b7c32b2-9cc9-bb99-b203-2842e4495f5c" - ], - "datacenters": [ - "yul1" - ], - "deployment_id": "", - "deployment_status": "", - "deregister_on_destroy": true, - "deregister_on_id_change": true, - "detach": false, - "id": "prod-exporter", - "jobspec": "job \"prod-exporter\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"system\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-exporter-amd64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-nodeexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"raw_exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/node_exporter-1.0.1.linux-amd64/node_exporter\"\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-amd64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"nodeexporter\"\n port = \"nodeexporter\"\n check {\n name = \"Node Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"nodeexporter\" {\n static = 9100\n }\n }\n }\n }\n task \"prod-task2-blackboxexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/blackbox_exporter-0.18.0.linux-amd64/blackbox_exporter\"\n args = [\n \"--config.file=secrets/blackbox.yml\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/blackbox.yml\"\n data = \u003c\u003cEOH\nmodules:\n http_2xx:\n prober: http\n timeout: 5s\n http:\n valid_http_versions: [\"HTTP/1.1\", \"HTTP/2.0\"]\n no_follow_redirects: false\n fail_if_ssl: false\n fail_if_not_ssl: true\n tls_config:\n insecure_skip_verify: false\n preferred_ip_protocol: \"ip4\"\n icmp_v4:\n prober: icmp\n timeout: 5s\n icmp:\n preferred_ip_protocol: \"ip4\"\n dns_udp:\n prober: dns\n timeout: 5s\n dns:\n query_name: \"jenkins.fd.io\"\n query_type: \"A\"\n valid_rcodes:\n - NOERROR\nEOH\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-amd64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"blackboxexporter\"\n port = \"blackboxexporter\"\n tags = [ \"blackboxexporter${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Blackbox Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"blackboxexporter\" {\n static = 9115\n }\n }\n }\n }\n\n task \"prod-task3-cadvisorexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"gcr.io/cadvisor/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"cadvisorexporter\"\n port = \"cadvisorexporter\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"cadvisorexporter\" {\n static = 8080\n }\n }\n }\n }\n }\n\n group \"prod-group1-exporter-arm64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"==\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-nodeexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"raw_exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/node_exporter-1.0.1.linux-arm64/node_exporter\"\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-arm64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"nodeexporter\"\n port = \"nodeexporter\"\n check {\n name = \"Node Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"nodeexporter\" {\n static = 9100\n }\n }\n }\n }\n\n task \"prod-task2-blackboxexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/blackbox_exporter-0.18.0.linux-arm64/blackbox_exporter\"\n args = [\n \"--config.file=secrets/blackbox.yml\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/blackbox.yml\"\n data = \u003c\u003cEOH\nmodules:\n http_2xx:\n prober: http\n timeout: 5s\n http:\n valid_http_versions: [\"HTTP/1.1\", \"HTTP/2.0\"]\n no_follow_redirects: false\n fail_if_ssl: false\n fail_if_not_ssl: true\n tls_config:\n insecure_skip_verify: false\n preferred_ip_protocol: \"ip4\"\n icmp_v4:\n prober: icmp\n timeout: 5s\n icmp:\n preferred_ip_protocol: \"ip4\"\n dns_udp:\n prober: dns\n timeout: 5s\n dns:\n query_name: \"jenkins.fd.io\"\n query_type: \"A\"\n valid_rcodes:\n - NOERROR\nEOH\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-arm64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"blackboxexporter\"\n port = \"blackboxexporter\"\n tags = [ \"blackboxexporter${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Blackbox Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"blackboxexporter\" {\n static = 9115\n }\n }\n }\n }\n\n task \"prod-task3-cadvisorexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n # There is currently no official release for arm yet...using community.\n image = \"zcube/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"cadvisorexporter\"\n port = \"cadvisorexporter\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"cadvisorexporter\" {\n static = 8080\n }\n }\n }\n }\n }\n}", - "json": null, - "modify_index": "7147927", - "name": "prod-exporter", - "namespace": "default", - "policy_override": null, - "purge_on_destroy": null, - "region": "global", - "task_groups": [ - { - "count": 1, - "meta": {}, - "name": "prod-group1-exporter-amd64", - "task": [ - { - "driver": "raw_exec", - "meta": {}, - "name": "prod-task1-nodeexporter-amd64", - "volume_mounts": [] - }, - { - "driver": "exec", - "meta": {}, - "name": "prod-task2-blackboxexporter-amd64", - "volume_mounts": [] - }, - { - "driver": "docker", - "meta": {}, - "name": "prod-task3-cadvisorexporter-amd64", - "volume_mounts": [] - } - ], - "volumes": [] - }, - { - "count": 1, - "meta": {}, - "name": "prod-group1-exporter-arm64", - "task": [ - { - "driver": "raw_exec", - "meta": {}, - "name": "prod-task1-nodeexporter-arm64", - "volume_mounts": [] - }, - { - "driver": "exec", - "meta": {}, - "name": "prod-task2-blackboxexporter-arm64", - "volume_mounts": [] - }, - { - "driver": "docker", - "meta": {}, - "name": "prod-task3-cadvisorexporter-arm64", - "volume_mounts": [] - } - ], - "volumes": [] - } - ], - "type": "system" - }, - "sensitive_attributes": [], - "private": "bnVsbA==", - "dependencies": [ - "module.exporter.data.template_file.nomad_job_exporter" - ] - } - ] - }, - { "module": "module.grafana", "mode": "data", "type": "template_file", @@ -288,7 +138,9 @@ { "schema_version": 0, "attributes": { - "allocation_ids": null, + "allocation_ids": [ + "1ddd68d2-ab33-a727-c667-1713435b506b" + ], "datacenters": [ "yul1" ], @@ -416,7 +268,7 @@ "schema_version": 0, "attributes": { "allocation_ids": [ - "3ff45a9a-4635-3719-ae12-c750ebb3e800" + "7e9cea8d-a5e5-47db-7f9c-653c59c93928" ], "datacenters": [ "yul1" @@ -429,7 +281,7 @@ "id": "prod-mc", "jobspec": "job \"prod-mc\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"batch\"\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-mc\" {\n task \"prod-task1-create-buckets\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"minio/mc:RELEASE.2020-12-10T01-26-17Z\"\n entrypoint = [\n \"/bin/sh\",\n \"-c\",\n \"mc config host add LOCALMINIO http://storage.service.consul:9000 $MINIO_ACCESS_KEY $MINIO_SECRET_KEY \u0026\u0026 mc mb -p LOCALMINIO/logs.fd.io LOCALMINIO/docs.fd.io ; mc policy set public LOCALMINIO/logs.fd.io mc policy set public LOCALMINIO/docs.fd.io mc ilm add --expiry-days '180' LOCALMINIO/logs.fd.io mc admin user add LOCALMINIO storage Storage1234 mc admin policy set LOCALMINIO writeonly user=storage\"\n ]\n dns_servers = [ \"${attr.unique.network.ip-address}\" ]\n privileged = false\n }\n\n # The env stanza configures a list of environment variables to populate\n # the task's environment before starting.\n env {\n \n MINIO_ACCESS_KEY = \"minio\"\n MINIO_SECRET_KEY = \"minio123\"\n \n \n }\n }\n }\n}\n", "json": null, - "modify_index": "7148928", + "modify_index": "7202809", "name": "prod-mc", "namespace": "default", "policy_override": null, @@ -474,15 +326,15 @@ "schema_version": 0, "attributes": { "allocation_ids": [ - "322e6ea5-acc7-2f37-3022-cd0a31c2f3db", - "619833b9-dd70-46ae-f576-75c3cd058a13", - "0510561a-c828-dd23-5910-b92d9f2c41ef", - "504924c8-387b-abe2-c4d5-d6a7424a4689" + "72adbff3-2fd7-b3f8-9f19-be9511f91a26", + "fcc12ad6-caed-632d-1c4a-14fe0dc9741e", + "89fcea10-d56b-ab07-e2b8-435c118e800b", + "1fe6b306-ee08-415c-fbef-cdcdf3fbc2ec" ], "datacenters": [ "yul1" ], - "deployment_id": "cd6d79cf-6e6f-289d-a0bb-6b2bb8da769a", + "deployment_id": "93fcf42d-2ab9-b10b-9184-b99e08621504", "deployment_status": "successful", "deregister_on_destroy": true, "deregister_on_id_change": true, @@ -569,7 +421,7 @@ "schema_version": 0, "attributes": { "allocation_ids": [ - "1dea6de0-d5cd-2a1b-fb42-0468e986e0d6" + "f576ae38-8398-dfa2-bc6c-18355896ed2f" ], "datacenters": [ "yul1" @@ -625,9 +477,9 @@ "schema_version": 0, "attributes": { "filename": null, - "id": "cd26d062785357d96c95d6f8f40b2d8d7b430be3399de176e6b250822c4af86c", - "rendered": "job \"prod-prometheus\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-prometheus\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n \n volume \"prod-volume1-prometheus\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-prometheus\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n volume_mount {\n volume = \"prod-volume1-prometheus\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=/data/prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[2m])) * 100) \u003e 80\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 80%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.30:8500', '10.30.51.32:8500', '10.30.51.33:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'cadvisorexporter' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Node Exporter'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nodeexporter' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"prometheus\"\n port = \"prometheus\"\n tags = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 8192\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"prometheus\" {\n static = 9090\n }\n }\n }\n }\n }\n}", - "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n %{ if use_host_volume }\n volume \"prod-volume1-${service_name}\" {\n type = \"host\"\n read_only = false\n source = \"${host_volume}\"\n }\n %{ endif }\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n %{ if use_host_volume }\n volume_mount {\n volume = \"prod-volume1-${service_name}\"\n destination = \"${data_dir}\"\n read_only = false\n }\n %{ endif }\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-${version}.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=${data_dir}prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${url}\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[2m])) * 100) \u003e 80\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 80%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.30:8500', '10.30.51.32:8500', '10.30.51.33:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'cadvisorexporter' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Node Exporter'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nodeexporter' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}", + "id": "f5c5b3316512492fa4e1caea185bafeeeb51b619e76b08ab473ca9523c2b8268", + "rendered": "job \"prod-prometheus\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-prometheus\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n \n volume \"prod-volume1-prometheus\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-prometheus\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n volume_mount {\n volume = \"prod-volume1-prometheus\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=/data/prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 95%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.28:8500' ]\n - targets: [ '10.30.51.29:8500' ]\n - targets: [ '10.30.51.30:8500' ]\n - targets: [ '10.30.51.32:8500' ]\n - targets: [ '10.30.51.33:8500' ]\n - targets: [ '10.30.51.34:8500' ]\n - targets: [ '10.30.51.35:8500' ]\n - targets: [ '10.30.51.39:8500' ]\n - targets: [ '10.30.51.40:8500' ]\n - targets: [ '10.30.51.50:8500' ]\n - targets: [ '10.30.51.51:8500' ]\n - targets: [ '10.30.51.65:8500' ]\n - targets: [ '10.30.51.66:8500' ]\n - targets: [ '10.30.51.67:8500' ]\n - targets: [ '10.30.51.68:8500' ]\n - targets: [ '10.30.51.70:8500' ]\n - targets: [ '10.30.51.71:8500' ]\n - targets: [ '10.32.8.14:8500' ]\n - targets: [ '10.32.8.15:8500' ]\n - targets: [ '10.32.8.16:8500' ]\n - targets: [ '10.32.8.17:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n static_configs:\n - targets: [ '10.30.51.28:8080' ]\n - targets: [ '10.30.51.29:8080' ]\n - targets: [ '10.30.51.30:8080' ]\n #- targets: [ '10.30.51.32:8080' ]\n - targets: [ '10.30.51.33:8080' ]\n - targets: [ '10.30.51.34:8080' ]\n - targets: [ '10.30.51.35:8080' ]\n - targets: [ '10.30.51.39:8080' ]\n - targets: [ '10.30.51.40:8080' ]\n - targets: [ '10.30.51.50:8080' ]\n - targets: [ '10.30.51.51:8080' ]\n - targets: [ '10.30.51.65:8080' ]\n - targets: [ '10.30.51.66:8080' ]\n - targets: [ '10.30.51.67:8080' ]\n - targets: [ '10.30.51.68:8080' ]\n - targets: [ '10.30.51.70:8080' ]\n - targets: [ '10.30.51.71:8080' ]\n - targets: [ '10.32.8.14:8080' ]\n - targets: [ '10.32.8.15:8080' ]\n - targets: [ '10.32.8.16:8080' ]\n - targets: [ '10.32.8.17:8080' ]\n\n - job_name: 'Node Exporter'\n static_configs:\n - targets: [ '10.30.51.28:9100' ]\n - targets: [ '10.30.51.29:9100' ]\n - targets: [ '10.30.51.30:9100' ]\n - targets: [ '10.30.51.32:9100' ]\n - targets: [ '10.30.51.33:9100' ]\n - targets: [ '10.30.51.34:9100' ]\n - targets: [ '10.30.51.35:9100' ]\n - targets: [ '10.30.51.39:9100' ]\n - targets: [ '10.30.51.40:9100' ]\n - targets: [ '10.30.51.50:9100' ]\n - targets: [ '10.30.51.51:9100' ]\n - targets: [ '10.30.51.65:9100' ]\n - targets: [ '10.30.51.66:9100' ]\n - targets: [ '10.30.51.67:9100' ]\n - targets: [ '10.30.51.68:9100' ]\n - targets: [ '10.30.51.70:9100' ]\n - targets: [ '10.30.51.71:9100' ]\n - targets: [ '10.32.8.14:9100' ]\n - targets: [ '10.32.8.15:9100' ]\n - targets: [ '10.32.8.16:9100' ]\n - targets: [ '10.32.8.17:9100' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"prometheus\"\n port = \"prometheus\"\n tags = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 8192\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"prometheus\" {\n static = 9090\n }\n }\n }\n }\n }\n}", + "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n %{ if use_host_volume }\n volume \"prod-volume1-${service_name}\" {\n type = \"host\"\n read_only = false\n source = \"${host_volume}\"\n }\n %{ endif }\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n %{ if use_host_volume }\n volume_mount {\n volume = \"prod-volume1-${service_name}\"\n destination = \"${data_dir}\"\n read_only = false\n }\n %{ endif }\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-${version}.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=${data_dir}prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${url}\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 95%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.28:8500' ]\n - targets: [ '10.30.51.29:8500' ]\n - targets: [ '10.30.51.30:8500' ]\n - targets: [ '10.30.51.32:8500' ]\n - targets: [ '10.30.51.33:8500' ]\n - targets: [ '10.30.51.34:8500' ]\n - targets: [ '10.30.51.35:8500' ]\n - targets: [ '10.30.51.39:8500' ]\n - targets: [ '10.30.51.40:8500' ]\n - targets: [ '10.30.51.50:8500' ]\n - targets: [ '10.30.51.51:8500' ]\n - targets: [ '10.30.51.65:8500' ]\n - targets: [ '10.30.51.66:8500' ]\n - targets: [ '10.30.51.67:8500' ]\n - targets: [ '10.30.51.68:8500' ]\n - targets: [ '10.30.51.70:8500' ]\n - targets: [ '10.30.51.71:8500' ]\n - targets: [ '10.32.8.14:8500' ]\n - targets: [ '10.32.8.15:8500' ]\n - targets: [ '10.32.8.16:8500' ]\n - targets: [ '10.32.8.17:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n static_configs:\n - targets: [ '10.30.51.28:8080' ]\n - targets: [ '10.30.51.29:8080' ]\n - targets: [ '10.30.51.30:8080' ]\n #- targets: [ '10.30.51.32:8080' ]\n - targets: [ '10.30.51.33:8080' ]\n - targets: [ '10.30.51.34:8080' ]\n - targets: [ '10.30.51.35:8080' ]\n - targets: [ '10.30.51.39:8080' ]\n - targets: [ '10.30.51.40:8080' ]\n - targets: [ '10.30.51.50:8080' ]\n - targets: [ '10.30.51.51:8080' ]\n - targets: [ '10.30.51.65:8080' ]\n - targets: [ '10.30.51.66:8080' ]\n - targets: [ '10.30.51.67:8080' ]\n - targets: [ '10.30.51.68:8080' ]\n - targets: [ '10.30.51.70:8080' ]\n - targets: [ '10.30.51.71:8080' ]\n - targets: [ '10.32.8.14:8080' ]\n - targets: [ '10.32.8.15:8080' ]\n - targets: [ '10.32.8.16:8080' ]\n - targets: [ '10.32.8.17:8080' ]\n\n - job_name: 'Node Exporter'\n static_configs:\n - targets: [ '10.30.51.28:9100' ]\n - targets: [ '10.30.51.29:9100' ]\n - targets: [ '10.30.51.30:9100' ]\n - targets: [ '10.30.51.32:9100' ]\n - targets: [ '10.30.51.33:9100' ]\n - targets: [ '10.30.51.34:9100' ]\n - targets: [ '10.30.51.35:9100' ]\n - targets: [ '10.30.51.39:9100' ]\n - targets: [ '10.30.51.40:9100' ]\n - targets: [ '10.30.51.50:9100' ]\n - targets: [ '10.30.51.51:9100' ]\n - targets: [ '10.30.51.65:9100' ]\n - targets: [ '10.30.51.66:9100' ]\n - targets: [ '10.30.51.67:9100' ]\n - targets: [ '10.30.51.68:9100' ]\n - targets: [ '10.30.51.70:9100' ]\n - targets: [ '10.30.51.71:9100' ]\n - targets: [ '10.32.8.14:9100' ]\n - targets: [ '10.32.8.15:9100' ]\n - targets: [ '10.32.8.16:9100' ]\n - targets: [ '10.32.8.17:9100' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}", "vars": { "cpu": "2000", "data_dir": "/data/", @@ -660,10 +512,10 @@ "schema_version": 0, "attributes": { "allocation_ids": [ - "79c4c6b7-e8e4-79e7-9dae-9a26142cd94c", - "2e11f44e-814e-db74-a9ee-8bcbebb1fa0b", - "83f9c9aa-f4f2-b832-a937-aaee4181493e", - "231f1338-a871-b44e-d545-e8af01d7eebe" + "81af62a3-09d6-5e29-a63a-e13952f9caec", + "1516d874-0001-78b1-f6ec-727067af2f29", + "fcf4df05-3083-e96a-9e8b-0856c0fed8f9", + "6eaada1d-5bea-7b95-6564-7c0ff31b3053" ], "datacenters": [ "yul1" @@ -674,9 +526,9 @@ "deregister_on_id_change": true, "detach": false, "id": "prod-prometheus", - "jobspec": "job \"prod-prometheus\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-prometheus\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n \n volume \"prod-volume1-prometheus\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-prometheus\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n volume_mount {\n volume = \"prod-volume1-prometheus\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=/data/prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[2m])) * 100) \u003e 80\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 80%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.30:8500', '10.30.51.32:8500', '10.30.51.33:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'cadvisorexporter' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Node Exporter'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nodeexporter' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"prometheus\"\n port = \"prometheus\"\n tags = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 8192\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"prometheus\" {\n static = 9090\n }\n }\n }\n }\n }\n}", + "jobspec": "job \"prod-prometheus\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-prometheus\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n \n volume \"prod-volume1-prometheus\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-prometheus\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n volume_mount {\n volume = \"prod-volume1-prometheus\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=/data/prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 95%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.28:8500' ]\n - targets: [ '10.30.51.29:8500' ]\n - targets: [ '10.30.51.30:8500' ]\n - targets: [ '10.30.51.32:8500' ]\n - targets: [ '10.30.51.33:8500' ]\n - targets: [ '10.30.51.34:8500' ]\n - targets: [ '10.30.51.35:8500' ]\n - targets: [ '10.30.51.39:8500' ]\n - targets: [ '10.30.51.40:8500' ]\n - targets: [ '10.30.51.50:8500' ]\n - targets: [ '10.30.51.51:8500' ]\n - targets: [ '10.30.51.65:8500' ]\n - targets: [ '10.30.51.66:8500' ]\n - targets: [ '10.30.51.67:8500' ]\n - targets: [ '10.30.51.68:8500' ]\n - targets: [ '10.30.51.70:8500' ]\n - targets: [ '10.30.51.71:8500' ]\n - targets: [ '10.32.8.14:8500' ]\n - targets: [ '10.32.8.15:8500' ]\n - targets: [ '10.32.8.16:8500' ]\n - targets: [ '10.32.8.17:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n static_configs:\n - targets: [ '10.30.51.28:8080' ]\n - targets: [ '10.30.51.29:8080' ]\n - targets: [ '10.30.51.30:8080' ]\n #- targets: [ '10.30.51.32:8080' ]\n - targets: [ '10.30.51.33:8080' ]\n - targets: [ '10.30.51.34:8080' ]\n - targets: [ '10.30.51.35:8080' ]\n - targets: [ '10.30.51.39:8080' ]\n - targets: [ '10.30.51.40:8080' ]\n - targets: [ '10.30.51.50:8080' ]\n - targets: [ '10.30.51.51:8080' ]\n - targets: [ '10.30.51.65:8080' ]\n - targets: [ '10.30.51.66:8080' ]\n - targets: [ '10.30.51.67:8080' ]\n - targets: [ '10.30.51.68:8080' ]\n - targets: [ '10.30.51.70:8080' ]\n - targets: [ '10.30.51.71:8080' ]\n - targets: [ '10.32.8.14:8080' ]\n - targets: [ '10.32.8.15:8080' ]\n - targets: [ '10.32.8.16:8080' ]\n - targets: [ '10.32.8.17:8080' ]\n\n - job_name: 'Node Exporter'\n static_configs:\n - targets: [ '10.30.51.28:9100' ]\n - targets: [ '10.30.51.29:9100' ]\n - targets: [ '10.30.51.30:9100' ]\n - targets: [ '10.30.51.32:9100' ]\n - targets: [ '10.30.51.33:9100' ]\n - targets: [ '10.30.51.34:9100' ]\n - targets: [ '10.30.51.35:9100' ]\n - targets: [ '10.30.51.39:9100' ]\n - targets: [ '10.30.51.40:9100' ]\n - targets: [ '10.30.51.50:9100' ]\n - targets: [ '10.30.51.51:9100' ]\n - targets: [ '10.30.51.65:9100' ]\n - targets: [ '10.30.51.66:9100' ]\n - targets: [ '10.30.51.67:9100' ]\n - targets: [ '10.30.51.68:9100' ]\n - targets: [ '10.30.51.70:9100' ]\n - targets: [ '10.30.51.71:9100' ]\n - targets: [ '10.32.8.14:9100' ]\n - targets: [ '10.32.8.15:9100' ]\n - targets: [ '10.32.8.16:9100' ]\n - targets: [ '10.32.8.17:9100' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"prometheus\"\n port = \"prometheus\"\n tags = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 8192\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"prometheus\" {\n static = 9090\n }\n }\n }\n }\n }\n}", "json": null, - "modify_index": "7147932", + "modify_index": "7201194", "name": "prod-prometheus", "namespace": "default", "policy_override": null, @@ -758,10 +610,10 @@ "schema_version": 0, "attributes": { "allocation_ids": [ + "0239788e-a5eb-b54d-0cf2-2404b3ec7c53", "ecfcd9ad-b959-0fa4-c1ec-8b82195830f1", "190f5699-0d10-7f85-ac68-96927702070b", - "2fbbef45-f00a-1367-08c4-c2239de9e78d", - "dd57bc83-2229-d242-1caf-b743f186e7a2" + "2fbbef45-f00a-1367-08c4-c2239de9e78d" ], "datacenters": [ "yul1" diff --git a/terraform-ci-infra/1n_nmd/terraform.tfstate.backup b/terraform-ci-infra/1n_nmd/terraform.tfstate.backup index 5d33863ff0..f40e2c2c37 100644 --- a/terraform-ci-infra/1n_nmd/terraform.tfstate.backup +++ b/terraform-ci-infra/1n_nmd/terraform.tfstate.backup @@ -1,7 +1,7 @@ { "version": 4, "terraform_version": "0.14.5", - "serial": 1030, + "serial": 1037, "lineage": "e4e7f30a-652d-7a31-e31c-5e3a3388c9b9", "outputs": {}, "resources": [ @@ -16,9 +16,9 @@ "schema_version": 0, "attributes": { "filename": null, - "id": "ae09d392c808970a05f2423cd8dbf158f6cb6d3ae50260cd7d16f1575ce43871", - "rendered": "job \"prod-alertmanager\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-alertmanager\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-alertmanager\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\nglobal:\n # The API URL to use for Slack notifications.\n slack_api_url: 'https://hooks.slack.com/services/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'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: 'default-receiver'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname', 'cluster', 'service']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n # This routes performs a regular expression match on alert labels to\n # catch alerts that are related to a list of services.\n - match_re:\n service: .*\n receiver: default-receiver\n # The service has a sub-route for critical alerts, any alerts\n # that do not match, i.e. severity != critical, fall-back to the\n # parent node and are sent to 'team-X-mails'\n routes:\n - match:\n severity: critical\n receiver: 'default-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n # Apply inhibition if the alertname is the same.\n # CAUTION:\n # If all label names listed in `equal` are missing\n # from both the source and target alerts,\n # the inhibition rule will apply!\n equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: 'default-receiver'\n slack_configs:\n - channel: '#fdio-infra-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"alertmanager\"\n port = \"alertmanager\"\n tags = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 1000\n memory = 1024\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"alertmanager\" {\n static = 9093\n }\n }\n }\n }\n }\n}", - "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-${version}.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${url}\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\nglobal:\n # The API URL to use for Slack notifications.\n slack_api_url: '${slack_api_url}'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: '${default_receiver}'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname', 'cluster', 'service']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n # This routes performs a regular expression match on alert labels to\n # catch alerts that are related to a list of services.\n - match_re:\n service: .*\n receiver: ${default_receiver}\n # The service has a sub-route for critical alerts, any alerts\n # that do not match, i.e. severity != critical, fall-back to the\n # parent node and are sent to 'team-X-mails'\n routes:\n - match:\n severity: critical\n receiver: '${default_receiver}'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n # Apply inhibition if the alertname is the same.\n # CAUTION:\n # If all label names listed in `equal` are missing\n # from both the source and target alerts,\n # the inhibition rule will apply!\n equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: '${default_receiver}'\n slack_configs:\n - channel: '#${slack_channel}'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}", + "id": "f723f0544ca4fcac1162853257d62928794f50523b9186cdf2c24684721bd058", + "rendered": "job \"prod-alertmanager\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-alertmanager\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-alertmanager\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\nglobal:\n # The API URL to use for Slack notifications.\n slack_api_url: 'https://hooks.slack.com/services/XX'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: 'default-receiver'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname', 'cluster', 'service']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n # This routes performs a regular expression match on alert labels to\n # catch alerts that are related to a list of services.\n - match_re:\n service: .*\n receiver: default-receiver\n # The service has a sub-route for critical alerts, any alerts\n # that do not match, i.e. severity != critical, fall-back to the\n # parent node and are sent to 'team-X-mails'\n routes:\n - match:\n severity: critical\n receiver: 'default-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n # Apply inhibition if the alertname is the same.\n # CAUTION:\n # If all label names listed in `equal` are missing\n # from both the source and target alerts,\n # the inhibition rule will apply!\n equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: 'default-receiver'\n slack_configs:\n - channel: '#fdio-infra-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"alertmanager\"\n port = \"alertmanager\"\n tags = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 1000\n memory = 1024\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"alertmanager\" {\n static = 9093\n }\n }\n }\n }\n }\n}", + "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-${version}.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${url}\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\nglobal:\n # The API URL to use for Slack notifications.\n slack_api_url: 'https://hooks.slack.com/services/${slack_api_key}'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: '${default_receiver}'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname', 'cluster', 'service']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n # This routes performs a regular expression match on alert labels to\n # catch alerts that are related to a list of services.\n - match_re:\n service: .*\n receiver: ${default_receiver}\n # The service has a sub-route for critical alerts, any alerts\n # that do not match, i.e. severity != critical, fall-back to the\n # parent node and are sent to 'team-X-mails'\n routes:\n - match:\n severity: critical\n receiver: '${default_receiver}'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n # Apply inhibition if the alertname is the same.\n # CAUTION:\n # If all label names listed in `equal` are missing\n # from both the source and target alerts,\n # the inhibition rule will apply!\n equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: '${default_receiver}'\n slack_configs:\n - channel: '#${slack_channel}'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}", "vars": { "cpu": "1000", "datacenters": "yul1", @@ -28,7 +28,7 @@ "mem": "1024", "port": "9093", "service_name": "alertmanager", - "slack_api_url": "https://hooks.slack.com/services/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", + "slack_api_key": "XX", "slack_channel": "fdio-infra-monitoring", "url": "https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz", "use_canary": "true", @@ -51,20 +51,20 @@ "schema_version": 0, "attributes": { "allocation_ids": [ - "464144d9-8c27-dc03-71bb-6f8d525bc486" + "bf013613-1ac7-6155-69e0-51ff57719549" ], "datacenters": [ "yul1" ], - "deployment_id": "d69d73a6-2a1b-ebf6-e421-30ce1f34f519", + "deployment_id": "36a8a7ba-bd0c-bce9-bcc1-284e352dfd32", "deployment_status": "successful", "deregister_on_destroy": true, "deregister_on_id_change": true, "detach": false, "id": "prod-alertmanager", - "jobspec": "job \"prod-alertmanager\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-alertmanager\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-alertmanager\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\nglobal:\n # The API URL to use for Slack notifications.\n slack_api_url: 'https://hooks.slack.com/services/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'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: 'default-receiver'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname', 'cluster', 'service']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n # This routes performs a regular expression match on alert labels to\n # catch alerts that are related to a list of services.\n - match_re:\n service: .*\n receiver: default-receiver\n # The service has a sub-route for critical alerts, any alerts\n # that do not match, i.e. severity != critical, fall-back to the\n # parent node and are sent to 'team-X-mails'\n routes:\n - match:\n severity: critical\n receiver: 'default-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n # Apply inhibition if the alertname is the same.\n # CAUTION:\n # If all label names listed in `equal` are missing\n # from both the source and target alerts,\n # the inhibition rule will apply!\n equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: 'default-receiver'\n slack_configs:\n - channel: '#fdio-infra-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"alertmanager\"\n port = \"alertmanager\"\n tags = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 1000\n memory = 1024\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"alertmanager\" {\n static = 9093\n }\n }\n }\n }\n }\n}", + "jobspec": "job \"prod-alertmanager\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-alertmanager\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-alertmanager\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\nglobal:\n # The API URL to use for Slack notifications.\n slack_api_url: 'https://hooks.slack.com/services/XX'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: 'default-receiver'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname', 'cluster', 'service']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n # This routes performs a regular expression match on alert labels to\n # catch alerts that are related to a list of services.\n - match_re:\n service: .*\n receiver: default-receiver\n # The service has a sub-route for critical alerts, any alerts\n # that do not match, i.e. severity != critical, fall-back to the\n # parent node and are sent to 'team-X-mails'\n routes:\n - match:\n severity: critical\n receiver: 'default-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n # Apply inhibition if the alertname is the same.\n # CAUTION:\n # If all label names listed in `equal` are missing\n # from both the source and target alerts,\n # the inhibition rule will apply!\n equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: 'default-receiver'\n slack_configs:\n - channel: '#fdio-infra-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"alertmanager\"\n port = \"alertmanager\"\n tags = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 1000\n memory = 1024\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"alertmanager\" {\n static = 9093\n }\n }\n }\n }\n }\n}", "json": null, - "modify_index": "7147924", + "modify_index": "7150816", "name": "prod-alertmanager", "namespace": "default", "policy_override": null, @@ -107,9 +107,9 @@ "schema_version": 0, "attributes": { "filename": null, - "id": "28db119a83617686c94496ed114455041d4fc500d93c4de8f8b8cfbe7d4c1db2", - "rendered": "job \"prod-exporter\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"system\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-exporter-amd64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-nodeexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"raw_exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/node_exporter-1.0.1.linux-amd64/node_exporter\"\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-amd64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"nodeexporter\"\n port = \"nodeexporter\"\n check {\n name = \"Node Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"nodeexporter\" {\n static = 9100\n }\n }\n }\n }\n task \"prod-task2-blackboxexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/blackbox_exporter-0.18.0.linux-amd64/blackbox_exporter\"\n args = [\n \"--config.file=secrets/blackbox.yml\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/blackbox.yml\"\n data = \u003c\u003cEOH\nmodules:\n http_2xx:\n prober: http\n timeout: 5s\n http:\n valid_http_versions: [\"HTTP/1.1\", \"HTTP/2.0\"]\n no_follow_redirects: false\n fail_if_ssl: false\n fail_if_not_ssl: true\n tls_config:\n insecure_skip_verify: false\n preferred_ip_protocol: \"ip4\"\n icmp_v4:\n prober: icmp\n timeout: 5s\n icmp:\n preferred_ip_protocol: \"ip4\"\n dns_udp:\n prober: dns\n timeout: 5s\n dns:\n query_name: \"jenkins.fd.io\"\n query_type: \"A\"\n valid_rcodes:\n - NOERROR\nEOH\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-amd64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"blackboxexporter\"\n port = \"blackboxexporter\"\n tags = [ \"blackboxexporter${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Blackbox Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"blackboxexporter\" {\n static = 9115\n }\n }\n }\n }\n\n task \"prod-task3-cadvisorexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"gcr.io/cadvisor/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"cadvisorexporter\"\n port = \"cadvisorexporter\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"cadvisorexporter\" {\n static = 8080\n }\n }\n }\n }\n }\n\n group \"prod-group1-exporter-arm64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"==\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-nodeexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"raw_exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/node_exporter-1.0.1.linux-arm64/node_exporter\"\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-arm64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"nodeexporter\"\n port = \"nodeexporter\"\n check {\n name = \"Node Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"nodeexporter\" {\n static = 9100\n }\n }\n }\n }\n\n task \"prod-task2-blackboxexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/blackbox_exporter-0.18.0.linux-arm64/blackbox_exporter\"\n args = [\n \"--config.file=secrets/blackbox.yml\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/blackbox.yml\"\n data = \u003c\u003cEOH\nmodules:\n http_2xx:\n prober: http\n timeout: 5s\n http:\n valid_http_versions: [\"HTTP/1.1\", \"HTTP/2.0\"]\n no_follow_redirects: false\n fail_if_ssl: false\n fail_if_not_ssl: true\n tls_config:\n insecure_skip_verify: false\n preferred_ip_protocol: \"ip4\"\n icmp_v4:\n prober: icmp\n timeout: 5s\n icmp:\n preferred_ip_protocol: \"ip4\"\n dns_udp:\n prober: dns\n timeout: 5s\n dns:\n query_name: \"jenkins.fd.io\"\n query_type: \"A\"\n valid_rcodes:\n - NOERROR\nEOH\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-arm64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"blackboxexporter\"\n port = \"blackboxexporter\"\n tags = [ \"blackboxexporter${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Blackbox Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"blackboxexporter\" {\n static = 9115\n }\n }\n }\n }\n\n task \"prod-task3-cadvisorexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n # There is currently no official release for arm yet...using community.\n image = \"zcube/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"cadvisorexporter\"\n port = \"cadvisorexporter\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"cadvisorexporter\" {\n static = 8080\n }\n }\n }\n }\n }\n}", - "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"system\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-exporter-amd64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${node_service_name}-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"raw_exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/node_exporter-${node_version}.linux-amd64/node_exporter\"\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${node_url_amd64}\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${node_service_name}\"\n port = \"${node_service_name}\"\n check {\n name = \"Node Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${node_service_name}\" {\n static = ${node_port}\n }\n }\n }\n }\n task \"prod-task2-${blackbox_service_name}-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/blackbox_exporter-${blackbox_version}.linux-amd64/blackbox_exporter\"\n args = [\n \"--config.file=secrets/blackbox.yml\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/blackbox.yml\"\n data = \u003c\u003cEOH\nmodules:\n http_2xx:\n prober: http\n timeout: 5s\n http:\n valid_http_versions: [\"HTTP/1.1\", \"HTTP/2.0\"]\n no_follow_redirects: false\n fail_if_ssl: false\n fail_if_not_ssl: true\n tls_config:\n insecure_skip_verify: false\n preferred_ip_protocol: \"ip4\"\n icmp_v4:\n prober: icmp\n timeout: 5s\n icmp:\n preferred_ip_protocol: \"ip4\"\n dns_udp:\n prober: dns\n timeout: 5s\n dns:\n query_name: \"jenkins.fd.io\"\n query_type: \"A\"\n valid_rcodes:\n - NOERROR\nEOH\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${blackbox_url_amd64}\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${blackbox_service_name}\"\n port = \"${blackbox_service_name}\"\n tags = [ \"${blackbox_service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Blackbox Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${blackbox_service_name}\" {\n static = ${blackbox_port}\n }\n }\n }\n }\n\n task \"prod-task3-${cadvisor_service_name}-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"${cadvisor_image}\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${cadvisor_service_name}\"\n port = \"${cadvisor_service_name}\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${cadvisor_service_name}\" {\n static = ${cadvisor_port}\n }\n }\n }\n }\n }\n\n group \"prod-group1-exporter-arm64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"==\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${node_service_name}-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"raw_exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/node_exporter-${node_version}.linux-arm64/node_exporter\"\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${node_url_arm64}\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${node_service_name}\"\n port = \"${node_service_name}\"\n check {\n name = \"Node Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${node_service_name}\" {\n static = ${node_port}\n }\n }\n }\n }\n\n task \"prod-task2-${blackbox_service_name}-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/blackbox_exporter-${blackbox_version}.linux-arm64/blackbox_exporter\"\n args = [\n \"--config.file=secrets/blackbox.yml\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/blackbox.yml\"\n data = \u003c\u003cEOH\nmodules:\n http_2xx:\n prober: http\n timeout: 5s\n http:\n valid_http_versions: [\"HTTP/1.1\", \"HTTP/2.0\"]\n no_follow_redirects: false\n fail_if_ssl: false\n fail_if_not_ssl: true\n tls_config:\n insecure_skip_verify: false\n preferred_ip_protocol: \"ip4\"\n icmp_v4:\n prober: icmp\n timeout: 5s\n icmp:\n preferred_ip_protocol: \"ip4\"\n dns_udp:\n prober: dns\n timeout: 5s\n dns:\n query_name: \"jenkins.fd.io\"\n query_type: \"A\"\n valid_rcodes:\n - NOERROR\nEOH\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${blackbox_url_arm64}\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${blackbox_service_name}\"\n port = \"${blackbox_service_name}\"\n tags = [ \"${blackbox_service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Blackbox Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${blackbox_service_name}\" {\n static = ${blackbox_port}\n }\n }\n }\n }\n\n task \"prod-task3-${cadvisor_service_name}-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n # There is currently no official release for arm yet...using community.\n image = \"zcube/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${cadvisor_service_name}\"\n port = \"${cadvisor_service_name}\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${cadvisor_service_name}\" {\n static = ${cadvisor_port}\n }\n }\n }\n }\n }\n}", + "id": "04a453cfc12a27f7b6d323a51d4cde3c76479feb5ccbfdfc2b43f6d4556e4f63", + "rendered": "job \"prod-exporter\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"system\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-exporter-amd64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task3-cadvisorexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"gcr.io/cadvisor/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"cadvisorexporter\"\n port = \"cadvisorexporter\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"cadvisorexporter\" {\n static = 8080\n }\n }\n }\n }\n }\n\n group \"prod-group1-exporter-arm64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"==\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task3-cadvisorexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n # There is currently no official release for arm yet...using community.\n image = \"zcube/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"cadvisorexporter\"\n port = \"cadvisorexporter\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"cadvisorexporter\" {\n static = 8080\n }\n }\n }\n }\n }\n}", + "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"system\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-exporter-amd64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task3-${cadvisor_service_name}-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"${cadvisor_image}\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${cadvisor_service_name}\"\n port = \"${cadvisor_service_name}\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${cadvisor_service_name}\" {\n static = ${cadvisor_port}\n }\n }\n }\n }\n }\n\n group \"prod-group1-exporter-arm64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"==\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task3-${cadvisor_service_name}-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n # There is currently no official release for arm yet...using community.\n image = \"zcube/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${cadvisor_service_name}\"\n port = \"${cadvisor_service_name}\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${cadvisor_service_name}\" {\n static = ${cadvisor_port}\n }\n }\n }\n }\n }\n}", "vars": { "blackbox_port": "9115", "blackbox_service_name": "blackboxexporter", @@ -143,28 +143,7 @@ { "schema_version": 0, "attributes": { - "allocation_ids": [ - "e11d14cf-0b12-7f11-f982-e75abb2fa245", - "be1d7792-8b38-0769-1899-4d78ac17d775", - "33190f8d-6b97-5a79-32c3-c2a48eb33cab", - "7b9a89f3-3094-73aa-de9e-b674cdf3962b", - "1b730294-e8c6-d330-72a6-7502ef6d8195", - "3b3932ce-d57e-d0c6-0d36-b32724d83a0f", - "d07855d6-29a8-dba5-92a8-bed5673c3b81", - "4ea7fa69-4819-5050-37ea-bbce995e994d", - "2c694f1a-fb0e-67b9-407a-dd5f1d72daa0", - "52e828d4-79a7-9e43-2cdb-1ba6088f3257", - "e23fc3eb-1699-e1e8-629b-82a21b72ecf4", - "6f83b8c0-4313-eb5a-9c22-e8db2226e832", - "15ee749c-07f0-2864-f907-919faa62cee2", - "12025e7c-4a27-9a9b-b0aa-a205179139eb", - "a6234b96-8c7b-1fd4-e85f-d361774d3005", - "fadf8b49-0e1c-98ea-42b5-119667fa092e", - "2cb8905c-3d36-878d-917e-493f8a982970", - "a1426c7b-a4a1-0f19-cc11-162b396e9ce9", - "f24eee7b-e480-3386-9a6e-357edb9ae137", - "7b7c32b2-9cc9-bb99-b203-2842e4495f5c" - ], + "allocation_ids": null, "datacenters": [ "yul1" ], @@ -174,9 +153,9 @@ "deregister_on_id_change": true, "detach": false, "id": "prod-exporter", - "jobspec": "job \"prod-exporter\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"system\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-exporter-amd64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-nodeexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"raw_exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/node_exporter-1.0.1.linux-amd64/node_exporter\"\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-amd64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"nodeexporter\"\n port = \"nodeexporter\"\n check {\n name = \"Node Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"nodeexporter\" {\n static = 9100\n }\n }\n }\n }\n task \"prod-task2-blackboxexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/blackbox_exporter-0.18.0.linux-amd64/blackbox_exporter\"\n args = [\n \"--config.file=secrets/blackbox.yml\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/blackbox.yml\"\n data = \u003c\u003cEOH\nmodules:\n http_2xx:\n prober: http\n timeout: 5s\n http:\n valid_http_versions: [\"HTTP/1.1\", \"HTTP/2.0\"]\n no_follow_redirects: false\n fail_if_ssl: false\n fail_if_not_ssl: true\n tls_config:\n insecure_skip_verify: false\n preferred_ip_protocol: \"ip4\"\n icmp_v4:\n prober: icmp\n timeout: 5s\n icmp:\n preferred_ip_protocol: \"ip4\"\n dns_udp:\n prober: dns\n timeout: 5s\n dns:\n query_name: \"jenkins.fd.io\"\n query_type: \"A\"\n valid_rcodes:\n - NOERROR\nEOH\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-amd64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"blackboxexporter\"\n port = \"blackboxexporter\"\n tags = [ \"blackboxexporter${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Blackbox Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"blackboxexporter\" {\n static = 9115\n }\n }\n }\n }\n\n task \"prod-task3-cadvisorexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"gcr.io/cadvisor/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"cadvisorexporter\"\n port = \"cadvisorexporter\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"cadvisorexporter\" {\n static = 8080\n }\n }\n }\n }\n }\n\n group \"prod-group1-exporter-arm64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"==\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-nodeexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"raw_exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/node_exporter-1.0.1.linux-arm64/node_exporter\"\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-arm64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"nodeexporter\"\n port = \"nodeexporter\"\n check {\n name = \"Node Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"nodeexporter\" {\n static = 9100\n }\n }\n }\n }\n\n task \"prod-task2-blackboxexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/blackbox_exporter-0.18.0.linux-arm64/blackbox_exporter\"\n args = [\n \"--config.file=secrets/blackbox.yml\"\n ]\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/blackbox.yml\"\n data = \u003c\u003cEOH\nmodules:\n http_2xx:\n prober: http\n timeout: 5s\n http:\n valid_http_versions: [\"HTTP/1.1\", \"HTTP/2.0\"]\n no_follow_redirects: false\n fail_if_ssl: false\n fail_if_not_ssl: true\n tls_config:\n insecure_skip_verify: false\n preferred_ip_protocol: \"ip4\"\n icmp_v4:\n prober: icmp\n timeout: 5s\n icmp:\n preferred_ip_protocol: \"ip4\"\n dns_udp:\n prober: dns\n timeout: 5s\n dns:\n query_name: \"jenkins.fd.io\"\n query_type: \"A\"\n valid_rcodes:\n - NOERROR\nEOH\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-arm64.tar.gz\"\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"blackboxexporter\"\n port = \"blackboxexporter\"\n tags = [ \"blackboxexporter${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Blackbox Exporter Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"blackboxexporter\" {\n static = 9115\n }\n }\n }\n }\n\n task \"prod-task3-cadvisorexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n # There is currently no official release for arm yet...using community.\n image = \"zcube/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"cadvisorexporter\"\n port = \"cadvisorexporter\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"cadvisorexporter\" {\n static = 8080\n }\n }\n }\n }\n }\n}", + "jobspec": "job \"prod-exporter\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"system\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-exporter-amd64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task3-cadvisorexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"gcr.io/cadvisor/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"cadvisorexporter\"\n port = \"cadvisorexporter\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"cadvisorexporter\" {\n static = 8080\n }\n }\n }\n }\n }\n\n group \"prod-group1-exporter-arm64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"==\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task3-cadvisorexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n # There is currently no official release for arm yet...using community.\n image = \"zcube/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"cadvisorexporter\"\n port = \"cadvisorexporter\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"cadvisorexporter\" {\n static = 8080\n }\n }\n }\n }\n }\n}", "json": null, - "modify_index": "7147927", + "modify_index": "7201184", "name": "prod-exporter", "namespace": "default", "policy_override": null, @@ -189,18 +168,6 @@ "name": "prod-group1-exporter-amd64", "task": [ { - "driver": "raw_exec", - "meta": {}, - "name": "prod-task1-nodeexporter-amd64", - "volume_mounts": [] - }, - { - "driver": "exec", - "meta": {}, - "name": "prod-task2-blackboxexporter-amd64", - "volume_mounts": [] - }, - { "driver": "docker", "meta": {}, "name": "prod-task3-cadvisorexporter-amd64", @@ -215,18 +182,6 @@ "name": "prod-group1-exporter-arm64", "task": [ { - "driver": "raw_exec", - "meta": {}, - "name": "prod-task1-nodeexporter-arm64", - "volume_mounts": [] - }, - { - "driver": "exec", - "meta": {}, - "name": "prod-task2-blackboxexporter-arm64", - "volume_mounts": [] - }, - { "driver": "docker", "meta": {}, "name": "prod-task3-cadvisorexporter-arm64", @@ -257,11 +212,11 @@ "schema_version": 0, "attributes": { "filename": null, - "id": "080a6dd88248c6a78ec3025a0ec9e471aed9acde495ca47db868b0893b4446b7", - "rendered": "job \"prod-grafana\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-grafana\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-grafana\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"grafana/grafana:7.3.7\"\n dns_servers = [ \"${attr.unique.network.ip-address}\" ]\n volumes = [\n \"secrets/prometheus.yml:/etc/grafana/provisioning/datasources/prometheus.yml\",\n \"secrets/dashboards.yml:/etc/grafana/provisioning/dashboards/dashboards.yml\",\n \"secrets/grafana.ini:/etc/grafana/grafana.ini\",\n \"secrets/node_exporter.json:/etc/grafana/provisioning/dashboards/node_exporter.json\",\n \"secrets/docker_cadvisor.json:/etc/grafana/provisioning/dashboards/docker_cadvisor.json\",\n \"secrets/nomad.json:/etc/grafana/provisioning/dashboards/nomad.json\",\n \"secrets/consul.json:/etc/grafana/provisioning/dashboards/consul.json\",\n \"secrets/prometheus.json:/etc/grafana/provisioning/dashboards/prometheus.json\",\n \"secrets/blackbox_exporter_http.json:/etc/grafana/provisioning/dashboards/blackbox_exporter_http.json\",\n \"secrets/blackbox_exporter_icmp.json:/etc/grafana/provisioning/dashboards/blackbox_exporter_icmp.json\"\n ]\n }\n\n artifact {\n # Prometheus Node Exporter\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/node_exporter.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Docker cAdvisor\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/docker_cadvisor.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Nomad\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/nomad.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Consul\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/consul.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/prometheus.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus Blackbox Exporter ICMP\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/blackbox_exporter_icmp.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus Blackbox Exporter HTTP\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/blackbox_exporter_http.json\"\n destination = \"secrets/\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\napiVersion: 1\ndatasources:\n- name: Prometheus\n type: prometheus\n access: direct\n orgId: 1\n url: http://prometheus.service.consul:9090\n basicAuth: false\n isDefault: true\n version: 1\n editable: false\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/dashboards.yml\"\n data = \u003c\u003cEOH\napiVersion: 1\nproviders:\n- name: dashboards\n type: file\n disableDeletion: false\n updateIntervalSeconds: 10\n allowUiUpdates: false\n options:\n path: /etc/grafana/provisioning/dashboards\n foldersFromFilesStructure: true\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/grafana.ini\"\n data = \u003c\u003cEOH\napp_mode = production\n\n[metrics]\nenabled = true\n\n[server]\nprotocol = http\nhttp_port = 3000\nroot_url = http://grafana.service.consul:3000\nenable_gzip = true\n;cert_file =\n;cert_key =\n\n[security]\nadmin_user = grafanauser\nadmin_password = Grafana1234\nsecret_key = SW2YcwTIb9zpOOhoPsMm\n\n[users]\nallow_sign_up = false\nallow_org_create = false\nauto_assign_org = true\nauto_assign_org_role = Viewer\ndefault_theme = dark\n\n[auth.basic]\nenabled = true\n\n[auth]\ndisable_login_form = false\ndisable_signout_menu = false\n\n[auth.anonymous]\nenabled = false\n\n[log]\nmode = console\nlevel = info\n\n[log.console]\nlevel = info\nformat = console\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"grafana\"\n port = \"grafana\"\n tags = [ \"grafana${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Grafana Check Live\"\n type = \"http\"\n protocol = \"http\"\n tls_skip_verify = true\n path = \"/api/health\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 2048\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"grafana\" {\n static = 3000\n }\n }\n }\n }\n }\n}", - "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"${image}\"\n dns_servers = [ \"$${attr.unique.network.ip-address}\" ]\n volumes = [\n \"secrets/prometheus.yml:/etc/grafana/provisioning/datasources/prometheus.yml\",\n \"secrets/dashboards.yml:/etc/grafana/provisioning/dashboards/dashboards.yml\",\n \"secrets/grafana.ini:/etc/grafana/grafana.ini\",\n \"secrets/node_exporter.json:/etc/grafana/provisioning/dashboards/node_exporter.json\",\n \"secrets/docker_cadvisor.json:/etc/grafana/provisioning/dashboards/docker_cadvisor.json\",\n \"secrets/nomad.json:/etc/grafana/provisioning/dashboards/nomad.json\",\n \"secrets/consul.json:/etc/grafana/provisioning/dashboards/consul.json\",\n \"secrets/prometheus.json:/etc/grafana/provisioning/dashboards/prometheus.json\",\n \"secrets/blackbox_exporter_http.json:/etc/grafana/provisioning/dashboards/blackbox_exporter_http.json\",\n \"secrets/blackbox_exporter_icmp.json:/etc/grafana/provisioning/dashboards/blackbox_exporter_icmp.json\"\n ]\n }\n\n artifact {\n # Prometheus Node Exporter\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/node_exporter.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Docker cAdvisor\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/docker_cadvisor.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Nomad\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/nomad.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Consul\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/consul.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/prometheus.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus Blackbox Exporter ICMP\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/blackbox_exporter_icmp.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus Blackbox Exporter HTTP\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/blackbox_exporter_http.json\"\n destination = \"secrets/\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\napiVersion: 1\ndatasources:\n- name: Prometheus\n type: prometheus\n access: direct\n orgId: 1\n url: http://prometheus.service.consul:9090\n basicAuth: false\n isDefault: true\n version: 1\n editable: false\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/dashboards.yml\"\n data = \u003c\u003cEOH\napiVersion: 1\nproviders:\n- name: dashboards\n type: file\n disableDeletion: false\n updateIntervalSeconds: 10\n allowUiUpdates: false\n options:\n path: /etc/grafana/provisioning/dashboards\n foldersFromFilesStructure: true\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/grafana.ini\"\n data = \u003c\u003cEOH\napp_mode = production\n\n[metrics]\nenabled = true\n\n[server]\nprotocol = http\nhttp_port = ${port}\nroot_url = http://${service_name}.service.consul:${port}\nenable_gzip = true\n;cert_file =\n;cert_key =\n\n[security]\nadmin_user = grafanauser\nadmin_password = Grafana1234\nsecret_key = SW2YcwTIb9zpOOhoPsMm\n\n[users]\nallow_sign_up = false\nallow_org_create = false\nauto_assign_org = true\nauto_assign_org_role = Viewer\ndefault_theme = dark\n\n[auth.basic]\nenabled = true\n\n[auth]\ndisable_login_form = false\ndisable_signout_menu = false\n\n[auth.anonymous]\nenabled = false\n\n[log]\nmode = console\nlevel = info\n\n[log.console]\nlevel = info\nformat = console\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Grafana Check Live\"\n type = \"http\"\n protocol = \"http\"\n tls_skip_verify = true\n path = \"/api/health\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}", + "id": "5c802f79894e52718b0d713abec8e46bf8b9831b7caa677ff22b657ea3254b42", + "rendered": "job \"prod-grafana\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-grafana\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-grafana\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"grafana/grafana:7.3.7\"\n dns_servers = [ \"${attr.unique.network.ip-address}\" ]\n volumes = [\n \"secrets/prometheus.yml:/etc/grafana/provisioning/datasources/prometheus.yml\",\n \"secrets/dashboards.yml:/etc/grafana/provisioning/dashboards/dashboards.yml\",\n \"secrets/grafana.ini:/etc/grafana/grafana.ini\",\n \"secrets/node_exporter.json:/etc/grafana/provisioning/dashboards/node_exporter.json\",\n \"secrets/docker_cadvisor.json:/etc/grafana/provisioning/dashboards/docker_cadvisor.json\",\n \"secrets/nomad.json:/etc/grafana/provisioning/dashboards/nomad.json\",\n \"secrets/consul.json:/etc/grafana/provisioning/dashboards/consul.json\",\n \"secrets/prometheus.json:/etc/grafana/provisioning/dashboards/prometheus.json\",\n \"secrets/blackbox_exporter_http.json:/etc/grafana/provisioning/dashboards/blackbox_exporter_http.json\",\n \"secrets/blackbox_exporter_icmp.json:/etc/grafana/provisioning/dashboards/blackbox_exporter_icmp.json\"\n ]\n }\n\n artifact {\n # Prometheus Node Exporter\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/node_exporter.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Docker cAdvisor\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/docker_cadvisor.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Nomad\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/nomad.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Consul\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/consul.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/prometheus.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus Blackbox Exporter HTTP\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/blackbox_exporter_http.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus Blackbox Exporter ICMP\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/blackbox_exporter_icmp.json\"\n destination = \"secrets/\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\napiVersion: 1\ndatasources:\n- name: Prometheus\n type: prometheus\n access: direct\n orgId: 1\n url: http://prometheus.service.consul:9090\n basicAuth: false\n isDefault: true\n version: 1\n editable: false\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/dashboards.yml\"\n data = \u003c\u003cEOH\napiVersion: 1\nproviders:\n- name: dashboards\n type: file\n disableDeletion: false\n updateIntervalSeconds: 10\n allowUiUpdates: false\n options:\n path: /etc/grafana/provisioning/dashboards\n foldersFromFilesStructure: true\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/grafana.ini\"\n data = \u003c\u003cEOH\napp_mode = production\n\n[metrics]\nenabled = true\n\n[server]\nprotocol = http\nhttp_port = 3000\nroot_url = http://grafana.service.consul:3000\nenable_gzip = true\n;cert_file =\n;cert_key =\n\n[security]\nadmin_user = grafanauser\nadmin_password = Grafana1234\nsecret_key = SW2YcwTIb9zpOOhoPsMm\n\n[users]\nallow_sign_up = false\nallow_org_create = false\nauto_assign_org = true\nauto_assign_org_role = Viewer\ndefault_theme = dark\n\n[auth.basic]\nenabled = true\n\n[auth]\ndisable_login_form = false\ndisable_signout_menu = false\n\n[auth.anonymous]\nenabled = false\n\n[log]\nmode = console\nlevel = info\n\n[log.console]\nlevel = info\nformat = console\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"grafana\"\n port = \"grafana\"\n tags = [ \"grafana${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Grafana Check Live\"\n type = \"http\"\n protocol = \"http\"\n tls_skip_verify = true\n path = \"/api/health\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 1000\n memory = 2048\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"grafana\" {\n static = 3000\n }\n }\n }\n }\n }\n}", + "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"${image}\"\n dns_servers = [ \"$${attr.unique.network.ip-address}\" ]\n volumes = [\n \"secrets/prometheus.yml:/etc/grafana/provisioning/datasources/prometheus.yml\",\n \"secrets/dashboards.yml:/etc/grafana/provisioning/dashboards/dashboards.yml\",\n \"secrets/grafana.ini:/etc/grafana/grafana.ini\",\n \"secrets/node_exporter.json:/etc/grafana/provisioning/dashboards/node_exporter.json\",\n \"secrets/docker_cadvisor.json:/etc/grafana/provisioning/dashboards/docker_cadvisor.json\",\n \"secrets/nomad.json:/etc/grafana/provisioning/dashboards/nomad.json\",\n \"secrets/consul.json:/etc/grafana/provisioning/dashboards/consul.json\",\n \"secrets/prometheus.json:/etc/grafana/provisioning/dashboards/prometheus.json\",\n \"secrets/blackbox_exporter_http.json:/etc/grafana/provisioning/dashboards/blackbox_exporter_http.json\",\n \"secrets/blackbox_exporter_icmp.json:/etc/grafana/provisioning/dashboards/blackbox_exporter_icmp.json\"\n ]\n }\n\n artifact {\n # Prometheus Node Exporter\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/node_exporter.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Docker cAdvisor\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/docker_cadvisor.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Nomad\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/nomad.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Consul\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/consul.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/prometheus.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus Blackbox Exporter HTTP\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/blackbox_exporter_http.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus Blackbox Exporter ICMP\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/blackbox_exporter_icmp.json\"\n destination = \"secrets/\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\napiVersion: 1\ndatasources:\n- name: Prometheus\n type: prometheus\n access: direct\n orgId: 1\n url: http://prometheus.service.consul:9090\n basicAuth: false\n isDefault: true\n version: 1\n editable: false\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/dashboards.yml\"\n data = \u003c\u003cEOH\napiVersion: 1\nproviders:\n- name: dashboards\n type: file\n disableDeletion: false\n updateIntervalSeconds: 10\n allowUiUpdates: false\n options:\n path: /etc/grafana/provisioning/dashboards\n foldersFromFilesStructure: true\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/grafana.ini\"\n data = \u003c\u003cEOH\napp_mode = production\n\n[metrics]\nenabled = true\n\n[server]\nprotocol = http\nhttp_port = ${port}\nroot_url = http://${service_name}.service.consul:${port}\nenable_gzip = true\n;cert_file =\n;cert_key =\n\n[security]\nadmin_user = grafanauser\nadmin_password = Grafana1234\nsecret_key = SW2YcwTIb9zpOOhoPsMm\n\n[users]\nallow_sign_up = false\nallow_org_create = false\nauto_assign_org = true\nauto_assign_org_role = Viewer\ndefault_theme = dark\n\n[auth.basic]\nenabled = true\n\n[auth]\ndisable_login_form = false\ndisable_signout_menu = false\n\n[auth.anonymous]\nenabled = false\n\n[log]\nmode = console\nlevel = info\n\n[log.console]\nlevel = info\nformat = console\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Grafana Check Live\"\n type = \"http\"\n protocol = \"http\"\n tls_skip_verify = true\n path = \"/api/health\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}", "vars": { - "cpu": "2000", + "cpu": "1000", "datacenters": "yul1", "group_count": "1", "image": "grafana/grafana:7.3.7", @@ -288,21 +243,20 @@ "schema_version": 0, "attributes": { "allocation_ids": [ - "1cf5f45a-3d10-b433-600e-f92696e49a25", - "d2ddca98-1bb8-680d-06f1-00fcf4887eca" + "1ddd68d2-ab33-a727-c667-1713435b506b" ], "datacenters": [ "yul1" ], - "deployment_id": "fef8bdc5-f769-c504-f089-8886755e45ef", + "deployment_id": "4b6abbd0-0129-03d7-b8e3-e7c256c5bcb6", "deployment_status": "successful", "deregister_on_destroy": true, "deregister_on_id_change": true, "detach": false, "id": "prod-grafana", - "jobspec": "job \"prod-grafana\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-grafana\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-grafana\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"grafana/grafana:7.3.7\"\n dns_servers = [ \"${attr.unique.network.ip-address}\" ]\n volumes = [\n \"secrets/prometheus.yml:/etc/grafana/provisioning/datasources/prometheus.yml\",\n \"secrets/dashboards.yml:/etc/grafana/provisioning/dashboards/dashboards.yml\",\n \"secrets/grafana.ini:/etc/grafana/grafana.ini\",\n \"secrets/node_exporter.json:/etc/grafana/provisioning/dashboards/node_exporter.json\",\n \"secrets/docker_cadvisor.json:/etc/grafana/provisioning/dashboards/docker_cadvisor.json\",\n \"secrets/nomad.json:/etc/grafana/provisioning/dashboards/nomad.json\",\n \"secrets/consul.json:/etc/grafana/provisioning/dashboards/consul.json\",\n \"secrets/prometheus.json:/etc/grafana/provisioning/dashboards/prometheus.json\",\n \"secrets/blackbox_exporter_http.json:/etc/grafana/provisioning/dashboards/blackbox_exporter_http.json\",\n \"secrets/blackbox_exporter_icmp.json:/etc/grafana/provisioning/dashboards/blackbox_exporter_icmp.json\"\n ]\n }\n\n artifact {\n # Prometheus Node Exporter\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/node_exporter.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Docker cAdvisor\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/docker_cadvisor.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Nomad\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/nomad.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Consul\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/consul.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/prometheus.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus Blackbox Exporter ICMP\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/blackbox_exporter_icmp.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus Blackbox Exporter HTTP\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/blackbox_exporter_http.json\"\n destination = \"secrets/\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\napiVersion: 1\ndatasources:\n- name: Prometheus\n type: prometheus\n access: direct\n orgId: 1\n url: http://prometheus.service.consul:9090\n basicAuth: false\n isDefault: true\n version: 1\n editable: false\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/dashboards.yml\"\n data = \u003c\u003cEOH\napiVersion: 1\nproviders:\n- name: dashboards\n type: file\n disableDeletion: false\n updateIntervalSeconds: 10\n allowUiUpdates: false\n options:\n path: /etc/grafana/provisioning/dashboards\n foldersFromFilesStructure: true\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/grafana.ini\"\n data = \u003c\u003cEOH\napp_mode = production\n\n[metrics]\nenabled = true\n\n[server]\nprotocol = http\nhttp_port = 3000\nroot_url = http://grafana.service.consul:3000\nenable_gzip = true\n;cert_file =\n;cert_key =\n\n[security]\nadmin_user = grafanauser\nadmin_password = Grafana1234\nsecret_key = SW2YcwTIb9zpOOhoPsMm\n\n[users]\nallow_sign_up = false\nallow_org_create = false\nauto_assign_org = true\nauto_assign_org_role = Viewer\ndefault_theme = dark\n\n[auth.basic]\nenabled = true\n\n[auth]\ndisable_login_form = false\ndisable_signout_menu = false\n\n[auth.anonymous]\nenabled = false\n\n[log]\nmode = console\nlevel = info\n\n[log.console]\nlevel = info\nformat = console\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"grafana\"\n port = \"grafana\"\n tags = [ \"grafana${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Grafana Check Live\"\n type = \"http\"\n protocol = \"http\"\n tls_skip_verify = true\n path = \"/api/health\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 2048\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"grafana\" {\n static = 3000\n }\n }\n }\n }\n }\n}", + "jobspec": "job \"prod-grafana\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-grafana\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-grafana\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"grafana/grafana:7.3.7\"\n dns_servers = [ \"${attr.unique.network.ip-address}\" ]\n volumes = [\n \"secrets/prometheus.yml:/etc/grafana/provisioning/datasources/prometheus.yml\",\n \"secrets/dashboards.yml:/etc/grafana/provisioning/dashboards/dashboards.yml\",\n \"secrets/grafana.ini:/etc/grafana/grafana.ini\",\n \"secrets/node_exporter.json:/etc/grafana/provisioning/dashboards/node_exporter.json\",\n \"secrets/docker_cadvisor.json:/etc/grafana/provisioning/dashboards/docker_cadvisor.json\",\n \"secrets/nomad.json:/etc/grafana/provisioning/dashboards/nomad.json\",\n \"secrets/consul.json:/etc/grafana/provisioning/dashboards/consul.json\",\n \"secrets/prometheus.json:/etc/grafana/provisioning/dashboards/prometheus.json\",\n \"secrets/blackbox_exporter_http.json:/etc/grafana/provisioning/dashboards/blackbox_exporter_http.json\",\n \"secrets/blackbox_exporter_icmp.json:/etc/grafana/provisioning/dashboards/blackbox_exporter_icmp.json\"\n ]\n }\n\n artifact {\n # Prometheus Node Exporter\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/node_exporter.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Docker cAdvisor\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/docker_cadvisor.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Nomad\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/nomad.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Consul\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/consul.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/prometheus.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus Blackbox Exporter HTTP\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/blackbox_exporter_http.json\"\n destination = \"secrets/\"\n }\n\n artifact {\n # Prometheus Blackbox Exporter ICMP\n source = \"https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/blackbox_exporter_icmp.json\"\n destination = \"secrets/\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\napiVersion: 1\ndatasources:\n- name: Prometheus\n type: prometheus\n access: direct\n orgId: 1\n url: http://prometheus.service.consul:9090\n basicAuth: false\n isDefault: true\n version: 1\n editable: false\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/dashboards.yml\"\n data = \u003c\u003cEOH\napiVersion: 1\nproviders:\n- name: dashboards\n type: file\n disableDeletion: false\n updateIntervalSeconds: 10\n allowUiUpdates: false\n options:\n path: /etc/grafana/provisioning/dashboards\n foldersFromFilesStructure: true\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/grafana.ini\"\n data = \u003c\u003cEOH\napp_mode = production\n\n[metrics]\nenabled = true\n\n[server]\nprotocol = http\nhttp_port = 3000\nroot_url = http://grafana.service.consul:3000\nenable_gzip = true\n;cert_file =\n;cert_key =\n\n[security]\nadmin_user = grafanauser\nadmin_password = Grafana1234\nsecret_key = SW2YcwTIb9zpOOhoPsMm\n\n[users]\nallow_sign_up = false\nallow_org_create = false\nauto_assign_org = true\nauto_assign_org_role = Viewer\ndefault_theme = dark\n\n[auth.basic]\nenabled = true\n\n[auth]\ndisable_login_form = false\ndisable_signout_menu = false\n\n[auth.anonymous]\nenabled = false\n\n[log]\nmode = console\nlevel = info\n\n[log.console]\nlevel = info\nformat = console\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"grafana\"\n port = \"grafana\"\n tags = [ \"grafana${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Grafana Check Live\"\n type = \"http\"\n protocol = \"http\"\n tls_skip_verify = true\n path = \"/api/health\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 1000\n memory = 2048\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"grafana\" {\n static = 3000\n }\n }\n }\n }\n }\n}", "json": null, - "modify_index": "7148410", + "modify_index": "7148913", "name": "prod-grafana", "namespace": "default", "policy_override": null, @@ -415,11 +369,10 @@ "provider": "provider[\"registry.terraform.io/hashicorp/nomad\"].yul1", "instances": [ { + "status": "tainted", "schema_version": 0, "attributes": { - "allocation_ids": [ - "8ca5aa5b-2616-f167-f40e-a0becb50c87e" - ], + "allocation_ids": null, "datacenters": [ "yul1" ], @@ -431,12 +384,12 @@ "id": "prod-mc", "jobspec": "job \"prod-mc\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"batch\"\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-mc\" {\n task \"prod-task1-create-buckets\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"minio/mc:RELEASE.2020-12-10T01-26-17Z\"\n entrypoint = [\n \"/bin/sh\",\n \"-c\",\n \"mc config host add LOCALMINIO http://storage.service.consul:9000 $MINIO_ACCESS_KEY $MINIO_SECRET_KEY \u0026\u0026 mc mb -p LOCALMINIO/logs.fd.io LOCALMINIO/docs.fd.io ; mc policy set public LOCALMINIO/logs.fd.io mc policy set public LOCALMINIO/docs.fd.io mc ilm add --expiry-days '180' LOCALMINIO/logs.fd.io mc admin user add LOCALMINIO storage Storage1234 mc admin policy set LOCALMINIO writeonly user=storage\"\n ]\n dns_servers = [ \"${attr.unique.network.ip-address}\" ]\n privileged = false\n }\n\n # The env stanza configures a list of environment variables to populate\n # the task's environment before starting.\n env {\n \n MINIO_ACCESS_KEY = \"minio\"\n MINIO_SECRET_KEY = \"minio123\"\n \n \n }\n }\n }\n}\n", "json": null, - "modify_index": "7148419", + "modify_index": "7201202", "name": "prod-mc", "namespace": "default", "policy_override": null, "purge_on_destroy": null, - "region": "global", + "region": null, "task_groups": [ { "count": 1, @@ -476,15 +429,15 @@ "schema_version": 0, "attributes": { "allocation_ids": [ - "322e6ea5-acc7-2f37-3022-cd0a31c2f3db", - "619833b9-dd70-46ae-f576-75c3cd058a13", - "0510561a-c828-dd23-5910-b92d9f2c41ef", - "504924c8-387b-abe2-c4d5-d6a7424a4689" + "72adbff3-2fd7-b3f8-9f19-be9511f91a26", + "fcc12ad6-caed-632d-1c4a-14fe0dc9741e", + "89fcea10-d56b-ab07-e2b8-435c118e800b", + "1fe6b306-ee08-415c-fbef-cdcdf3fbc2ec" ], "datacenters": [ "yul1" ], - "deployment_id": "cd6d79cf-6e6f-289d-a0bb-6b2bb8da769a", + "deployment_id": "93fcf42d-2ab9-b10b-9184-b99e08621504", "deployment_status": "successful", "deregister_on_destroy": true, "deregister_on_id_change": true, @@ -571,7 +524,7 @@ "schema_version": 0, "attributes": { "allocation_ids": [ - "1dea6de0-d5cd-2a1b-fb42-0468e986e0d6" + "f576ae38-8398-dfa2-bc6c-18355896ed2f" ], "datacenters": [ "yul1" @@ -627,9 +580,9 @@ "schema_version": 0, "attributes": { "filename": null, - "id": "cd26d062785357d96c95d6f8f40b2d8d7b430be3399de176e6b250822c4af86c", - "rendered": "job \"prod-prometheus\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-prometheus\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n \n volume \"prod-volume1-prometheus\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-prometheus\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n volume_mount {\n volume = \"prod-volume1-prometheus\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=/data/prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[2m])) * 100) \u003e 80\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 80%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.30:8500', '10.30.51.32:8500', '10.30.51.33:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'cadvisorexporter' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Node Exporter'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nodeexporter' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"prometheus\"\n port = \"prometheus\"\n tags = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 8192\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"prometheus\" {\n static = 9090\n }\n }\n }\n }\n }\n}", - "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n %{ if use_host_volume }\n volume \"prod-volume1-${service_name}\" {\n type = \"host\"\n read_only = false\n source = \"${host_volume}\"\n }\n %{ endif }\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n %{ if use_host_volume }\n volume_mount {\n volume = \"prod-volume1-${service_name}\"\n destination = \"${data_dir}\"\n read_only = false\n }\n %{ endif }\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-${version}.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=${data_dir}prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${url}\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[2m])) * 100) \u003e 80\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 80%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.30:8500', '10.30.51.32:8500', '10.30.51.33:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'cadvisorexporter' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Node Exporter'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nodeexporter' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}", + "id": "f5c5b3316512492fa4e1caea185bafeeeb51b619e76b08ab473ca9523c2b8268", + "rendered": "job \"prod-prometheus\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-prometheus\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n \n volume \"prod-volume1-prometheus\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-prometheus\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n volume_mount {\n volume = \"prod-volume1-prometheus\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=/data/prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 95%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.28:8500' ]\n - targets: [ '10.30.51.29:8500' ]\n - targets: [ '10.30.51.30:8500' ]\n - targets: [ '10.30.51.32:8500' ]\n - targets: [ '10.30.51.33:8500' ]\n - targets: [ '10.30.51.34:8500' ]\n - targets: [ '10.30.51.35:8500' ]\n - targets: [ '10.30.51.39:8500' ]\n - targets: [ '10.30.51.40:8500' ]\n - targets: [ '10.30.51.50:8500' ]\n - targets: [ '10.30.51.51:8500' ]\n - targets: [ '10.30.51.65:8500' ]\n - targets: [ '10.30.51.66:8500' ]\n - targets: [ '10.30.51.67:8500' ]\n - targets: [ '10.30.51.68:8500' ]\n - targets: [ '10.30.51.70:8500' ]\n - targets: [ '10.30.51.71:8500' ]\n - targets: [ '10.32.8.14:8500' ]\n - targets: [ '10.32.8.15:8500' ]\n - targets: [ '10.32.8.16:8500' ]\n - targets: [ '10.32.8.17:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n static_configs:\n - targets: [ '10.30.51.28:8080' ]\n - targets: [ '10.30.51.29:8080' ]\n - targets: [ '10.30.51.30:8080' ]\n #- targets: [ '10.30.51.32:8080' ]\n - targets: [ '10.30.51.33:8080' ]\n - targets: [ '10.30.51.34:8080' ]\n - targets: [ '10.30.51.35:8080' ]\n - targets: [ '10.30.51.39:8080' ]\n - targets: [ '10.30.51.40:8080' ]\n - targets: [ '10.30.51.50:8080' ]\n - targets: [ '10.30.51.51:8080' ]\n - targets: [ '10.30.51.65:8080' ]\n - targets: [ '10.30.51.66:8080' ]\n - targets: [ '10.30.51.67:8080' ]\n - targets: [ '10.30.51.68:8080' ]\n - targets: [ '10.30.51.70:8080' ]\n - targets: [ '10.30.51.71:8080' ]\n - targets: [ '10.32.8.14:8080' ]\n - targets: [ '10.32.8.15:8080' ]\n - targets: [ '10.32.8.16:8080' ]\n - targets: [ '10.32.8.17:8080' ]\n\n - job_name: 'Node Exporter'\n static_configs:\n - targets: [ '10.30.51.28:9100' ]\n - targets: [ '10.30.51.29:9100' ]\n - targets: [ '10.30.51.30:9100' ]\n - targets: [ '10.30.51.32:9100' ]\n - targets: [ '10.30.51.33:9100' ]\n - targets: [ '10.30.51.34:9100' ]\n - targets: [ '10.30.51.35:9100' ]\n - targets: [ '10.30.51.39:9100' ]\n - targets: [ '10.30.51.40:9100' ]\n - targets: [ '10.30.51.50:9100' ]\n - targets: [ '10.30.51.51:9100' ]\n - targets: [ '10.30.51.65:9100' ]\n - targets: [ '10.30.51.66:9100' ]\n - targets: [ '10.30.51.67:9100' ]\n - targets: [ '10.30.51.68:9100' ]\n - targets: [ '10.30.51.70:9100' ]\n - targets: [ '10.30.51.71:9100' ]\n - targets: [ '10.32.8.14:9100' ]\n - targets: [ '10.32.8.15:9100' ]\n - targets: [ '10.32.8.16:9100' ]\n - targets: [ '10.32.8.17:9100' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"prometheus\"\n port = \"prometheus\"\n tags = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 8192\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"prometheus\" {\n static = 9090\n }\n }\n }\n }\n }\n}", + "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n %{ if use_host_volume }\n volume \"prod-volume1-${service_name}\" {\n type = \"host\"\n read_only = false\n source = \"${host_volume}\"\n }\n %{ endif }\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n %{ if use_host_volume }\n volume_mount {\n volume = \"prod-volume1-${service_name}\"\n destination = \"${data_dir}\"\n read_only = false\n }\n %{ endif }\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-${version}.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=${data_dir}prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${url}\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 95%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.28:8500' ]\n - targets: [ '10.30.51.29:8500' ]\n - targets: [ '10.30.51.30:8500' ]\n - targets: [ '10.30.51.32:8500' ]\n - targets: [ '10.30.51.33:8500' ]\n - targets: [ '10.30.51.34:8500' ]\n - targets: [ '10.30.51.35:8500' ]\n - targets: [ '10.30.51.39:8500' ]\n - targets: [ '10.30.51.40:8500' ]\n - targets: [ '10.30.51.50:8500' ]\n - targets: [ '10.30.51.51:8500' ]\n - targets: [ '10.30.51.65:8500' ]\n - targets: [ '10.30.51.66:8500' ]\n - targets: [ '10.30.51.67:8500' ]\n - targets: [ '10.30.51.68:8500' ]\n - targets: [ '10.30.51.70:8500' ]\n - targets: [ '10.30.51.71:8500' ]\n - targets: [ '10.32.8.14:8500' ]\n - targets: [ '10.32.8.15:8500' ]\n - targets: [ '10.32.8.16:8500' ]\n - targets: [ '10.32.8.17:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n static_configs:\n - targets: [ '10.30.51.28:8080' ]\n - targets: [ '10.30.51.29:8080' ]\n - targets: [ '10.30.51.30:8080' ]\n #- targets: [ '10.30.51.32:8080' ]\n - targets: [ '10.30.51.33:8080' ]\n - targets: [ '10.30.51.34:8080' ]\n - targets: [ '10.30.51.35:8080' ]\n - targets: [ '10.30.51.39:8080' ]\n - targets: [ '10.30.51.40:8080' ]\n - targets: [ '10.30.51.50:8080' ]\n - targets: [ '10.30.51.51:8080' ]\n - targets: [ '10.30.51.65:8080' ]\n - targets: [ '10.30.51.66:8080' ]\n - targets: [ '10.30.51.67:8080' ]\n - targets: [ '10.30.51.68:8080' ]\n - targets: [ '10.30.51.70:8080' ]\n - targets: [ '10.30.51.71:8080' ]\n - targets: [ '10.32.8.14:8080' ]\n - targets: [ '10.32.8.15:8080' ]\n - targets: [ '10.32.8.16:8080' ]\n - targets: [ '10.32.8.17:8080' ]\n\n - job_name: 'Node Exporter'\n static_configs:\n - targets: [ '10.30.51.28:9100' ]\n - targets: [ '10.30.51.29:9100' ]\n - targets: [ '10.30.51.30:9100' ]\n - targets: [ '10.30.51.32:9100' ]\n - targets: [ '10.30.51.33:9100' ]\n - targets: [ '10.30.51.34:9100' ]\n - targets: [ '10.30.51.35:9100' ]\n - targets: [ '10.30.51.39:9100' ]\n - targets: [ '10.30.51.40:9100' ]\n - targets: [ '10.30.51.50:9100' ]\n - targets: [ '10.30.51.51:9100' ]\n - targets: [ '10.30.51.65:9100' ]\n - targets: [ '10.30.51.66:9100' ]\n - targets: [ '10.30.51.67:9100' ]\n - targets: [ '10.30.51.68:9100' ]\n - targets: [ '10.30.51.70:9100' ]\n - targets: [ '10.30.51.71:9100' ]\n - targets: [ '10.32.8.14:9100' ]\n - targets: [ '10.32.8.15:9100' ]\n - targets: [ '10.32.8.16:9100' ]\n - targets: [ '10.32.8.17:9100' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}", "vars": { "cpu": "2000", "data_dir": "/data/", @@ -661,12 +614,7 @@ { "schema_version": 0, "attributes": { - "allocation_ids": [ - "79c4c6b7-e8e4-79e7-9dae-9a26142cd94c", - "2e11f44e-814e-db74-a9ee-8bcbebb1fa0b", - "83f9c9aa-f4f2-b832-a937-aaee4181493e", - "231f1338-a871-b44e-d545-e8af01d7eebe" - ], + "allocation_ids": null, "datacenters": [ "yul1" ], @@ -676,9 +624,9 @@ "deregister_on_id_change": true, "detach": false, "id": "prod-prometheus", - "jobspec": "job \"prod-prometheus\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-prometheus\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n \n volume \"prod-volume1-prometheus\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-prometheus\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n volume_mount {\n volume = \"prod-volume1-prometheus\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=/data/prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[2m])) * 100) \u003e 80\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 80%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.30:8500', '10.30.51.32:8500', '10.30.51.33:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'cadvisorexporter' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Node Exporter'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nodeexporter' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"prometheus\"\n port = \"prometheus\"\n tags = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 8192\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"prometheus\" {\n static = 9090\n }\n }\n }\n }\n }\n}", + "jobspec": "job \"prod-prometheus\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-prometheus\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n \n volume \"prod-volume1-prometheus\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-prometheus\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n volume_mount {\n volume = \"prod-volume1-prometheus\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=/data/prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 95%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.28:8500' ]\n - targets: [ '10.30.51.29:8500' ]\n - targets: [ '10.30.51.30:8500' ]\n - targets: [ '10.30.51.32:8500' ]\n - targets: [ '10.30.51.33:8500' ]\n - targets: [ '10.30.51.34:8500' ]\n - targets: [ '10.30.51.35:8500' ]\n - targets: [ '10.30.51.39:8500' ]\n - targets: [ '10.30.51.40:8500' ]\n - targets: [ '10.30.51.50:8500' ]\n - targets: [ '10.30.51.51:8500' ]\n - targets: [ '10.30.51.65:8500' ]\n - targets: [ '10.30.51.66:8500' ]\n - targets: [ '10.30.51.67:8500' ]\n - targets: [ '10.30.51.68:8500' ]\n - targets: [ '10.30.51.70:8500' ]\n - targets: [ '10.30.51.71:8500' ]\n - targets: [ '10.32.8.14:8500' ]\n - targets: [ '10.32.8.15:8500' ]\n - targets: [ '10.32.8.16:8500' ]\n - targets: [ '10.32.8.17:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n static_configs:\n - targets: [ '10.30.51.28:8080' ]\n - targets: [ '10.30.51.29:8080' ]\n - targets: [ '10.30.51.30:8080' ]\n #- targets: [ '10.30.51.32:8080' ]\n - targets: [ '10.30.51.33:8080' ]\n - targets: [ '10.30.51.34:8080' ]\n - targets: [ '10.30.51.35:8080' ]\n - targets: [ '10.30.51.39:8080' ]\n - targets: [ '10.30.51.40:8080' ]\n - targets: [ '10.30.51.50:8080' ]\n - targets: [ '10.30.51.51:8080' ]\n - targets: [ '10.30.51.65:8080' ]\n - targets: [ '10.30.51.66:8080' ]\n - targets: [ '10.30.51.67:8080' ]\n - targets: [ '10.30.51.68:8080' ]\n - targets: [ '10.30.51.70:8080' ]\n - targets: [ '10.30.51.71:8080' ]\n - targets: [ '10.32.8.14:8080' ]\n - targets: [ '10.32.8.15:8080' ]\n - targets: [ '10.32.8.16:8080' ]\n - targets: [ '10.32.8.17:8080' ]\n\n - job_name: 'Node Exporter'\n static_configs:\n - targets: [ '10.30.51.28:9100' ]\n - targets: [ '10.30.51.29:9100' ]\n - targets: [ '10.30.51.30:9100' ]\n - targets: [ '10.30.51.32:9100' ]\n - targets: [ '10.30.51.33:9100' ]\n - targets: [ '10.30.51.34:9100' ]\n - targets: [ '10.30.51.35:9100' ]\n - targets: [ '10.30.51.39:9100' ]\n - targets: [ '10.30.51.40:9100' ]\n - targets: [ '10.30.51.50:9100' ]\n - targets: [ '10.30.51.51:9100' ]\n - targets: [ '10.30.51.65:9100' ]\n - targets: [ '10.30.51.66:9100' ]\n - targets: [ '10.30.51.67:9100' ]\n - targets: [ '10.30.51.68:9100' ]\n - targets: [ '10.30.51.70:9100' ]\n - targets: [ '10.30.51.71:9100' ]\n - targets: [ '10.32.8.14:9100' ]\n - targets: [ '10.32.8.15:9100' ]\n - targets: [ '10.32.8.16:9100' ]\n - targets: [ '10.32.8.17:9100' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"prometheus\"\n port = \"prometheus\"\n tags = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 8192\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"prometheus\" {\n static = 9090\n }\n }\n }\n }\n }\n}", "json": null, - "modify_index": "7147932", + "modify_index": "7201194", "name": "prod-prometheus", "namespace": "default", "policy_override": null, @@ -760,10 +708,10 @@ "schema_version": 0, "attributes": { "allocation_ids": [ + "0239788e-a5eb-b54d-0cf2-2404b3ec7c53", "ecfcd9ad-b959-0fa4-c1ec-8b82195830f1", "190f5699-0d10-7f85-ac68-96927702070b", - "2fbbef45-f00a-1367-08c4-c2239de9e78d", - "dd57bc83-2229-d242-1caf-b743f186e7a2" + "2fbbef45-f00a-1367-08c4-c2239de9e78d" ], "datacenters": [ "yul1" |