9 files changed, 832 insertions, 80 deletions
diff --git a/fdio.infra.terraform/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl.tftpl b/fdio.infra.terraform/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl.tftpl
index d1bb8e85cd..87206ac5a0 100644
--- a/fdio.infra.terraform/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl.tftpl
+++ b/fdio.infra.terraform/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl.tftpl
@@ -346,9 +346,6 @@ EOH
 
       # The service stanza instructs Nomad to register a service with Consul.
       #
-      # For more information and examples on the "task" stanza, please see
-      # the online documentation at:
-      #
       #     https://www.nomadproject.io/docs/job-specification/service
       #
       service {
diff --git a/fdio.infra.terraform/1n_nmd/prometheus/conf/nomad/prometheus.hcl.tftpl b/fdio.infra.terraform/1n_nmd/prometheus/conf/nomad/prometheus.hcl.tftpl
new file mode 100644
index 0000000000..224f7e5e00
--- /dev/null
+++ b/fdio.infra.terraform/1n_nmd/prometheus/conf/nomad/prometheus.hcl.tftpl
@@ -0,0 +1,624 @@
+job "${job_name}" {
+  # The "region" parameter specifies the region in which to execute the job.
+  # If omitted, this inherits the default region name of "global".
+  # region    = "${region}"
+
+  # The "datacenters" parameter specifies the list of datacenters which should
+  # be considered when placing this task. This must be provided.
+  datacenters = "${datacenters}"
+
+  # The "type" parameter controls the type of job, which impacts the scheduler's
+  # decision on placement. This configuration is optional and defaults to
+  # "service". For a full list of job types and their differences, please see
+  # the online documentation.
+  #
+  #     https://www.nomadproject.io/docs/jobspec/schedulers
+  #
+  type        = "service"
+
+  update {
+    # The "max_parallel" parameter specifies the maximum number of updates to
+    # perform in parallel. In this case, this specifies to update a single task
+    # at a time.
+    max_parallel      = ${max_parallel}
+
+    health_check      = "checks"
+
+    # The "min_healthy_time" parameter specifies the minimum time the allocation
+    # must be in the healthy state before it is marked as healthy and unblocks
+    # further allocations from being updated.
+    min_healthy_time  = "10s"
+
+    # The "healthy_deadline" parameter specifies the deadline in which the
+    # allocation must be marked as healthy after which the allocation is
+    # automatically transitioned to unhealthy. Transitioning to unhealthy will
+    # fail the deployment and potentially roll back the job if "auto_revert" is
+    # set to true.
+    healthy_deadline  = "3m"
+
+    # The "progress_deadline" parameter specifies the deadline in which an
+    # allocation must be marked as healthy. The deadline begins when the first
+    # allocation for the deployment is created and is reset whenever an allocation
+    # as part of the deployment transitions to a healthy state. If no allocation
+    # transitions to the healthy state before the progress deadline, the
+    # deployment is marked as failed.
+    progress_deadline = "10m"
+
+%{ if use_canary }
+    # The "canary" parameter specifies that changes to the job that would result
+    # in destructive updates should create the specified number of canaries
+    # without stopping any previous allocations. Once the operator determines the
+    # canaries are healthy, they can be promoted which unblocks a rolling update
+    # of the remaining allocations at a rate of "max_parallel".
+    #
+    # Further, setting "canary" equal to the count of the task group allows
+    # blue/green deployments. When the job is updated, a full set of the new
+    # version is deployed and upon promotion the old version is stopped.
+    canary            = ${canary}
+
+    # Specifies if the job should auto-promote to the canary version when all
+    # canaries become healthy during a deployment. Defaults to false which means
+    # canaries must be manually updated with the nomad deployment promote
+    # command.
+    auto_promote      = ${auto_promote}
+
+    # The "auto_revert" parameter specifies if the job should auto-revert to the
+    # last stable job on deployment failure. A job is marked as stable if all the
+    # allocations as part of its deployment were marked healthy.
+    auto_revert       = ${auto_revert}
+%{ endif }
+  }
+
+  # The "group" stanza defines a series of tasks that should be co-located on
+  # the same Nomad client. Any task within a group will be placed on the same
+  # client.
+  #
+  #     https://www.nomadproject.io/docs/job-specification/group
+  #
+  group "${job_name}-group-1" {
+    # The "count" parameter specifies the number of the task groups that should
+    # be running under this group. This value must be non-negative and defaults
+    # to 1.
+    count = ${group_count}
+
+    # The volume stanza allows the group to specify that it requires a given
+    # volume from the cluster. The key of the stanza is the name of the volume
+    # as it will be exposed to task configuration.
+    #
+    # https://www.nomadproject.io/docs/job-specification/volume
+    %{ if use_host_volume }
+    volume "${job_name}-volume-1" {
+      type      = "host"
+      read_only = false
+      source    = "${volume_source}"
+    }
+    %{ endif }
+
+    # The restart stanza configures a task's behavior on task failure. Restarts
+    # happen on the client that is running the task.
+    #
+    # https://www.nomadproject.io/docs/job-specification/restart
+    #
+    restart {
+      interval  = "30m"
+      attempts  = 40
+      delay     = "15s"
+      mode      = "delay"
+    }
+
+    # The constraint allows restricting the set of eligible nodes. Constraints
+    # may filter on attributes or client metadata.
+    #
+    #     https://www.nomadproject.io/docs/job-specification/constraint
+    #
+    constraint {
+      attribute = "$${attr.cpu.arch}"
+      operator  = "!="
+      value     = "arm64"
+    }
+
+    constraint {
+      attribute = "$${node.class}"
+      value     = "builder"
+    }
+
+    # The network stanza specifies the networking requirements for the task
+    # group, including the network mode and port allocations. When scheduling
+    # jobs in Nomad they are provisioned across your fleet of machines along
+    # with other jobs and services. Because you don't know in advance what host
+    # your job will be provisioned on, Nomad will provide your tasks with
+    # network configuration when they start up.
+    #
+    #     https://www.nomadproject.io/docs/job-specification/network
+    #
+    network {
+      port "${service_name}" {
+        static = ${port}
+        to     = ${port}
+      }
+    }
+
+    # The "task" stanza creates an individual unit of work, such as a Docker
+    # container, web application, or batch processing.
+    #
+    #     https://www.nomadproject.io/docs/job-specification/task
+    #
+    task "${job_name}-task-1" {
+      # The "driver" parameter specifies the task driver that should be used to
+      # run the task.
+      driver = "exec"
+
+    %{ if use_host_volume }
+      volume_mount {
+        volume      = "${job_name}-volume-1"
+        destination = "${volume_destination}"
+        read_only   = false
+      }
+    %{ endif }
+
+    %{ if use_vault_provider }
+      vault {
+        policies = "${vault_kv_policy_name}"
+      }
+    %{ endif }
+
+      # The "config" stanza specifies the driver configuration, which is passed
+      # directly to the driver to start the task. The details of configurations
+      # are specific to each driver, so please see specific driver
+      # documentation for more information.
+      config {
+        command         = "local/prometheus-${version}.linux-amd64/prometheus"
+        args            = [
+          "--config.file=secrets/prometheus.yml",
+          "--storage.tsdb.path=${volume_destination}prometheus/",
+          "--storage.tsdb.retention.time=7d"
+        ]
+      }
+
+      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,
+      # such as a file, tarball, or binary. Nomad downloads artifacts using the
+      # popular go-getter library, which permits downloading artifacts from a
+      # variety of locations using a URL as the input source.
+      #
+      #     https://www.nomadproject.io/docs/job-specification/artifact
+      #
+      artifact {
+        source = "${url}"
+      }
+
+      # The "template" stanza instructs Nomad to manage a template, such as
+      # a configuration file or script. This template can optionally pull data
+      # from Consul or Vault to populate runtime configuration data.
+      #
+      #     https://www.nomadproject.io/docs/job-specification/template
+      #
+      template {
+        change_mode     = "noop"
+        change_signal   = "SIGINT"
+        destination     = "secrets/alerts.yml"
+        left_delimiter  = "{{{"
+        right_delimiter = "}}}"
+        data            = <<EOH
+---
+groups:
+- name: "Jenkins Job Health Exporter"
+  rules:
+  - alert: JenkinsJobHealthExporterFailures
+    expr: jenkins_job_failure{id=~".*"} > jenkins_job_success{id=~".*"}
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Jenkins Job Health detected high failure rate on jenkins jobs."
+      description: "Job: {{ $labels.id }}"
+  - alert: JenkinsJobHealthExporterUnstable
+    expr: jenkins_job_unstable{id=~".*"} > jenkins_job_success{id=~".*"}
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Jenkins Job Health detected high unstable rate on jenkins jobs."
+      description: "Job: {{ $labels.id }}"
+- name: "Consul"
+  rules:
+  - alert: ConsulServiceHealthcheckFailed
+    expr: consul_catalog_service_node_healthy == 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Consul service healthcheck failed (instance {{ $labels.instance }})."
+      description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`."
+  - alert: ConsulMissingMasterNode
+    expr: consul_raft_peers < 3
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Consul missing master node (instance {{ $labels.instance }})."
+      description: "Numbers of consul raft peers should be 3, in order to preserve quorum."
+  - alert: ConsulAgentUnhealthy
+    expr: consul_health_node_status{status="critical"} == 1
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Consul agent unhealthy (instance {{ $labels.instance }})."
+      description: "A Consul agent is down."
+- name: "Hosts"
+  rules:
+  - alert: NodeDown
+    expr: up == 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Prometheus target missing (instance {{ $labels.instance }})."
+      description: "A Prometheus target has disappeared. An exporter might be crashed."
+  - alert: HostOutOfMemory
+    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Host out of memory (instance {{ $labels.instance }})."
+      description: "Node memory is filling up (< 10% left)."
+  - alert: HostOomKillDetected
+    expr: increase(node_vmstat_oom_kill[1m]) > 0
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Host OOM kill detected (instance {{ $labels.instance }})."
+      description: "OOM kill detected."
+  - alert: HostMemoryUnderMemoryPressure
+    expr: rate(node_vmstat_pgmajfault[1m]) > 1000
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Host memory under memory pressure (instance {{ $labels.instance }})."
+      description: "The node is under heavy memory pressure. High rate of major page faults."
+  - alert: HostOutOfDiskSpace
+    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Host out of disk space (instance {{ $labels.instance }})."
+      description: "Disk is almost full (< 10% left)."
+  - alert: HostRaidDiskFailure
+    expr: node_md_disks{state="failed"} > 0
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Host RAID disk failure (instance {{ $labels.instance }})."
+      description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap."
+  - alert: HostConntrackLimit
+    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Host conntrack limit (instance {{ $labels.instance }})."
+      description: "The number of conntrack entries is approaching the limit."
+  - alert: HostNetworkInterfaceSaturated
+    expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Host Network Interface Saturated (instance {{ $labels.instance }})."
+      description: "The network interface {{ $labels.device }} on {{ $labels.instance }} is getting overloaded."
+  - alert: HostSystemdServiceCrashed
+    expr: node_systemd_unit_state{state="failed"} == 1
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Host SystemD service crashed (instance {{ $labels.instance }})."
+      description: "SystemD service crashed."
+  - alert: HostEdacCorrectableErrorsDetected
+    expr: increase(node_edac_correctable_errors_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: info
+    annotations:
+      summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})."
+      description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.'
+  - alert: HostEdacUncorrectableErrorsDetected
+    expr: node_edac_uncorrectable_errors_total > 0
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})."
+      description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in total.'
+- name: "Min.io"
+  rules:
+  - alert: MinioDiskOffline
+    expr: minio_offline_disks > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Minio disk offline (instance {{ $labels.instance }})"
+      description: "Minio disk is offline."
+  - alert: MinioStorageSpaceExhausted
+    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 < 10
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Minio storage space exhausted (instance {{ $labels.instance }})."
+      description: "Minio storage space is low (< 10 GB)."
+- name: "Prometheus"
+  rules:
+  - alert: PrometheusConfigurationReloadFailure
+    expr: prometheus_config_last_reload_successful != 1
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})."
+      description: "Prometheus configuration reload error."
+  - alert: PrometheusTooManyRestarts
+    expr: changes(process_start_time_seconds{job=~"(?i:prometheus|pushgateway|alertmanager)"}[15m]) > 2
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Prometheus too many restarts (instance {{ $labels.instance }})."
+      description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping."
+  - alert: PrometheusAlertmanagerConfigurationReloadFailure
+    expr: alertmanager_config_last_reload_successful != 1
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})."
+      description: "AlertManager configuration reload error."
+  - alert: PrometheusRuleEvaluationFailures
+    expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})."
+      description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts."
+  - alert: PrometheusTargetScrapingSlow
+    expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Prometheus target scraping slow (instance {{ $labels.instance }})."
+      description: "Prometheus is scraping exporters slowly."
+  - alert: PrometheusTsdbCompactionsFailed
+    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})."
+      description: "Prometheus encountered {{ $value }} TSDB compactions failures."
+  - alert: PrometheusTsdbHeadTruncationsFailed
+    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})."
+      description: "Prometheus encountered {{ $value }} TSDB head truncation failures."
+  - alert: PrometheusTsdbWalCorruptions
+    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})."
+      description: "Prometheus encountered {{ $value }} TSDB WAL corruptions."
+  - alert: PrometheusTsdbWalTruncationsFailed
+    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})."
+      description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures."
+EOH
+      }
+
+      template {
+        change_mode     = "noop"
+        change_signal   = "SIGINT"
+        destination     = "secrets/prometheus.yml"
+        data            = <<EOH
+---
+global:
+  scrape_interval:     5s
+  scrape_timeout:      5s
+  evaluation_interval: 5s
+
+alerting:
+  alertmanagers:
+  - consul_sd_configs:
+    - server: '{{ env "NOMAD_IP_${service_name}" }}:8500'
+      services: [ 'alertmanager' ]
+
+rule_files:
+  - 'alerts.yml'
+
+scrape_configs:
+
+  - job_name: 'Nomad Cluster'
+    consul_sd_configs:
+    - server: '{{ env "NOMAD_IP_${service_name}" }}:8500'
+      services: [ 'nomad-client', 'nomad' ]
+    relabel_configs:
+    - source_labels: [__meta_consul_tags]
+      regex: '(.*)http(.*)'
+      action: keep
+    metrics_path: /v1/metrics
+    params:
+      format: [ 'prometheus' ]
+
+  - job_name: 'Consul Cluster'
+    static_configs:
+      - targets: [ '10.30.51.22:8500' ]
+      - targets: [ '10.30.51.24:8500' ]
+      - targets: [ '10.30.51.25:8500' ]
+      - targets: [ '10.30.51.26:8500' ]
+      - targets: [ '10.30.51.28:8500' ]
+      - targets: [ '10.30.51.29:8500' ]
+      - targets: [ '10.30.51.30:8500' ]
+      - targets: [ '10.30.51.39:8500' ]
+      - targets: [ '10.30.51.40:8500' ]
+      - targets: [ '10.30.51.50:8500' ]
+      - targets: [ '10.30.51.51:8500' ]
+      - targets: [ '10.30.51.65:8500' ]
+      - targets: [ '10.30.51.66:8500' ]
+      - targets: [ '10.30.51.67:8500' ]
+      - targets: [ '10.30.51.68:8500' ]
+      - targets: [ '10.30.51.70:8500' ]
+      - targets: [ '10.30.51.71:8500' ]
+      - targets: [ '10.32.8.14:8500' ]
+      - targets: [ '10.32.8.15:8500' ]
+      - targets: [ '10.32.8.16:8500' ]
+      - targets: [ '10.32.8.17:8500' ]
+    metrics_path: /v1/agent/metrics
+    params:
+      format: [ 'prometheus' ]
+
+  - job_name: 'Blackbox Exporter (icmp)'
+    static_configs:
+      - targets: [ 'gerrit.fd.io' ]
+      - targets: [ 'jenkins.fd.io' ]
+      - targets: [ '10.32.8.17' ]
+    params:
+      module: [ 'icmp_v4' ]
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target
+      - source_labels: [__param_target]
+        target_label: instance
+      - target_label: __address__
+        replacement: localhost:9115
+    metrics_path: /probe
+
+  - job_name: 'Blackbox Exporter (http)'
+    static_configs:
+      - targets: [ 'gerrit.fd.io' ]
+      - targets: [ 'jenkins.fd.io' ]
+    params:
+      module: [ 'http_2xx' ]
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target
+      - source_labels: [__param_target]
+        target_label: instance
+      - target_label: __address__
+        replacement: localhost:9115
+    metrics_path: /probe
+
+  - job_name: 'Jenkins Job Health Exporter'
+    static_configs:
+      - targets: [ '10.30.51.22:9186' ]
+    metric_relabel_configs:
+      - source_labels: [ __name__ ]
+        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
+        action: replace
+        replacement: '$1'
+        target_label: id
+      - source_labels: [ __name__ ]
+        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
+        replacement: 'jenkins_job_$2'
+        target_label: __name__
+
+  - job_name: 'Node Exporter'
+    static_configs:
+      - targets: [ '10.30.51.22:9100' ]
+      - targets: [ '10.30.51.24:9100' ]
+      - targets: [ '10.30.51.25:9100' ]
+      - targets: [ '10.30.51.26:9100' ]
+      - targets: [ '10.30.51.28:9100' ]
+      - targets: [ '10.30.51.29:9100' ]
+      - targets: [ '10.30.51.30:9100' ]
+      - targets: [ '10.30.51.39:9100' ]
+      - targets: [ '10.30.51.40:9100' ]
+      - targets: [ '10.30.51.50:9100' ]
+      - targets: [ '10.30.51.51:9100' ]
+      - targets: [ '10.30.51.65:9100' ]
+      - targets: [ '10.30.51.66:9100' ]
+      - targets: [ '10.30.51.67:9100' ]
+      - targets: [ '10.30.51.68:9100' ]
+      - targets: [ '10.30.51.70:9100' ]
+      - targets: [ '10.30.51.71:9100' ]
+      - targets: [ '10.32.8.14:9100' ]
+      - targets: [ '10.32.8.15:9100' ]
+      - targets: [ '10.32.8.16:9100' ]
+      - targets: [ '10.32.8.17:9100' ]
+
+  - job_name: 'Alertmanager'
+    consul_sd_configs:
+    - server: '{{ env "NOMAD_IP_${service_name}" }}:8500'
+      services: [ 'alertmanager' ]
+
+  - job_name: 'Grafana'
+    consul_sd_configs:
+    - server: '{{ env "NOMAD_IP_${service_name}" }}:8500'
+      services: [ 'grafana' ]
+
+  - job_name: 'Prometheus'
+    consul_sd_configs:
+    - server: '{{ env "NOMAD_IP_${service_name}" }}:8500'
+      services: [ 'prometheus' ]
+
+  - job_name: 'Minio'
+    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg
+    consul_sd_configs:
+    - server: '{{ env "NOMAD_IP_${service_name}" }}:8500'
+      services: [ 'storage' ]
+    metrics_path: /minio/prometheus/metrics
+
+  - job_name: 'Minio Proxy'
+    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3OTAwNjE1NDIsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJBS0lBUTJBSDdZUFBXVDZDV1hYSSJ9.CU9x9j-yO0_Uta5iep6yqNiGQPolrr2608E3lpU6Yg21rIv_eOwS5zqzXaSvrhzkJP9H5kO1Pj6kqjYhbqjN_w
+    consul_sd_configs:
+    - server: '{{ env "NOMAD_IP_${service_name}" }}:8500'
+      services: [ 'minio' ]
+    metrics_path: /minio/v2/metrics/cluster
+EOH
+      }
+
+      # The service stanza instructs Nomad to register a service with Consul.
+      #
+      #     https://www.nomadproject.io/docs/job-specification/service
+      #
+      service {
+        name       = "${service_name}"
+        port       = "${service_name}"
+        tags       = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ]
+        check {
+          name     = "Prometheus Check Live"
+          type     = "http"
+          path     = "/-/healthy"
+          interval = "10s"
+          timeout  = "2s"
+        }
+      }
+
+      # The "resources" stanza describes the requirements a task needs to
+      # execute. Resource requirements include memory, network, cpu, and more.
+      # This ensures the task will execute on a machine that contains enough
+      # resource capacity.
+      #
+      #     https://www.nomadproject.io/docs/job-specification/resources
+      #
+      resources {
+        cpu    = ${cpu}
+        memory = ${memory}
+      }
+    }
+  }
+}
diff --git a/fdio.infra.terraform/1n_nmd/prometheus/fdio/main.tf b/fdio.infra.terraform/1n_nmd/prometheus/fdio/main.tf
new file mode 100644
index 0000000000..e0ca417a78
--- /dev/null
+++ b/fdio.infra.terraform/1n_nmd/prometheus/fdio/main.tf
@@ -0,0 +1,10 @@
+module "prometheus" {
+  providers = {
+    nomad = nomad.yul1
+  }
+  source = "../"
+
+  # prometheus
+  datacenters = ["yul1"]
+  pm_version  = "2.33.1"
+}
+\ No newline at end of file
diff --git a/fdio.infra.terraform/1n_nmd/prometheus/fdio/providers.tf b/fdio.infra.terraform/1n_nmd/prometheus/fdio/providers.tf
new file mode 100644
index 0000000000..42a6a45ce0
--- /dev/null
+++ b/fdio.infra.terraform/1n_nmd/prometheus/fdio/providers.tf
@@ -0,0 +1,13 @@
+provider "nomad" {
+  address = var.nomad_provider_address
+  alias   = "yul1"
+  #  ca_file   = var.nomad_provider_ca_file
+  #  cert_file = var.nomad_provider_cert_file
+  #  key_file  = var.nomad_provider_key_file
+}
+
+provider "vault" {
+  address         = var.vault_provider_address
+  skip_tls_verify = var.vault_provider_skip_tls_verify
+  token           = var.vault_provider_token
+}
+\ No newline at end of file
diff --git a/fdio.infra.terraform/1n_nmd/prometheus/fdio/variables.tf b/fdio.infra.terraform/1n_nmd/prometheus/fdio/variables.tf
new file mode 100644
index 0000000000..7d5be09d21
--- /dev/null
+++ b/fdio.infra.terraform/1n_nmd/prometheus/fdio/variables.tf
@@ -0,0 +1,47 @@
+variable "nomad_acl" {
+  description = "Nomad ACLs enabled/disabled."
+  type        = bool
+  default     = false
+}
+
+variable "nomad_provider_address" {
+  description = "FD.io Nomad cluster address."
+  type        = string
+  default     = "http://10.32.8.14:4646"
+}
+
+variable "nomad_provider_ca_file" {
+  description = "A local file path to a PEM-encoded certificate authority."
+  type        = string
+  default     = "/etc/nomad.d/ssl/nomad-ca.pem"
+}
+
+variable "nomad_provider_cert_file" {
+  description = "A local file path to a PEM-encoded certificate."
+  type        = string
+  default     = "/etc/nomad.d/ssl/nomad-cli.pem"
+}
+
+variable "nomad_provider_key_file" {
+  description = "A local file path to a PEM-encoded private key."
+  type        = string
+  default     = "/etc/nomad.d/ssl/nomad-cli-key.pem"
+}
+
+variable "vault_provider_address" {
+  description = "Vault cluster address."
+  type        = string
+  default     = "http://10.30.51.28:8200"
+}
+
+variable "vault_provider_skip_tls_verify" {
+  description = "Verification of the Vault server's TLS certificate."
+  type        = bool
+  default     = false
+}
+
+variable "vault_provider_token" {
+  description = "Vault root token."
+  type        = string
+  sensitive   = true
+}
+\ No newline at end of file
diff --git a/fdio.infra.terraform/1n_nmd/prometheus/fdio/versions.tf b/fdio.infra.terraform/1n_nmd/prometheus/fdio/versions.tf
new file mode 100644
index 0000000000..f83709d154
--- /dev/null
+++ b/fdio.infra.terraform/1n_nmd/prometheus/fdio/versions.tf
@@ -0,0 +1,17 @@
+terraform {
+  backend "consul" {
+    address = "10.32.8.14:8500"
+    scheme  = "http"
+    path    = "terraform/prometheus"
+  }
+  required_providers {
+    nomad = {
+      source  = "hashicorp/nomad"
+      version = ">= 1.4.16"
+    }
+    vault = {
+      version = ">= 3.2.1"
+    }
+  }
+  required_version = ">= 1.1.4"
+}
+\ No newline at end of file
diff --git a/fdio.infra.terraform/1n_nmd/prometheus/main.tf b/fdio.infra.terraform/1n_nmd/prometheus/main.tf
index 0c504c9ca4..0a4d987831 100644
--- a/fdio.infra.terraform/1n_nmd/prometheus/main.tf
+++ b/fdio.infra.terraform/1n_nmd/prometheus/main.tf
@@ -1,37 +1,42 @@
 locals {
-  datacenters = join(",", var.nomad_datacenters)
-
-  prometheus_url = join("",
+  datacenters = join(",", var.datacenters)
+  url = join("",
     [
       "https://github.com",
       "/prometheus/prometheus/releases/download/",
-      "v${var.prometheus_version}/",
-      "prometheus-${var.prometheus_version}.linux-amd64.tar.gz"
+      "v${var.pm_version}/",
+      "prometheus-${var.pm_version}.linux-amd64.tar.gz"
     ]
   )
 }
 
-data "template_file" "nomad_job_prometheus" {
-  template = file("${path.module}/conf/nomad/prometheus.hcl")
-  vars = {
-    datacenters        = local.datacenters
-    url                = local.prometheus_url
-    job_name           = var.prometheus_job_name
-    use_canary         = var.prometheus_use_canary
-    group_count        = var.prometheus_group_count
-    use_host_volume    = var.prometheus_use_host_volume
-    host_volume        = var.nomad_host_volume
-    data_dir           = var.prometheus_data_dir
-    service_name       = var.prometheus_service_name
-    use_vault_provider = var.prometheus_vault_secret.use_vault_provider
-    version            = var.prometheus_version
-    cpu                = var.prometheus_cpu
-    mem                = var.prometheus_mem
-    port               = var.prometheus_port
-  }
-}
-
 resource "nomad_job" "nomad_job_prometheus" {
-  jobspec = data.template_file.nomad_job_prometheus.rendered
-  detach  = false
-}
-\ No newline at end of file
+  jobspec = templatefile(
+    "${path.module}/conf/nomad/prometheus.hcl.tftpl",
+    {
+      auto_promote              = var.auto_promote,
+      auto_revert               = var.auto_revert,
+      canary                    = var.canary,
+      cpu                       = var.cpu,
+      datacenters               = local.datacenters,
+      group_count               = var.group_count,
+      job_name                  = var.job_name,
+      max_parallel              = var.max_parallel,
+      memory                    = var.memory
+      port                      = var.port,
+      region                    = var.region,
+      service_name              = var.service_name,
+      url                       = local.url,
+      use_canary                = var.use_canary,
+      use_host_volume           = var.use_host_volume,
+      use_vault_provider        = var.vault_secret.use_vault_provider,
+      vault_kv_policy_name      = var.vault_secret.vault_kv_policy_name,
+      vault_kv_path             = var.vault_secret.vault_kv_path,
+      vault_kv_field_access_key = var.vault_secret.vault_kv_field_access_key,
+      vault_kv_field_secret_key = var.vault_secret.vault_kv_field_secret_key,
+      version                   = var.pm_version,
+      volume_destination        = var.volume_destination,
+      volume_source             = var.volume_source
+  })
+  detach = false
+}
diff --git a/fdio.infra.terraform/1n_nmd/prometheus/variables.tf b/fdio.infra.terraform/1n_nmd/prometheus/variables.tf
index befd9412cb..d44f9d5f8a 100644
--- a/fdio.infra.terraform/1n_nmd/prometheus/variables.tf
+++ b/fdio.infra.terraform/1n_nmd/prometheus/variables.tf
@@ -1,84 +1,127 @@
 # Nomad
-variable "nomad_datacenters" {
-  description = "Nomad data centers"
+variable "datacenters" {
+  description = "Specifies the list of DCs to be considered placing this task"
   type        = list(string)
   default     = ["dc1"]
 }
 
-variable "nomad_host_volume" {
-  description = "Nomad Host Volume"
+variable "region" {
+  description = "Specifies the region in which to execute the job"
+  type        = string
+  default     = "global"
+}
+
+variable "volume_source" {
+  description = "The name of the volume to request"
   type        = string
   default     = "persistence"
 }
 
 # Prometheus
-variable "prometheus_job_name" {
-  description = "Prometheus job name"
+variable "pm_version" {
+  description = "Prometheus version"
   type        = string
-  default     = "prometheus"
+  default     = "2.33.1"
+}
+
+variable "auto_promote" {
+  description = "Specifies if the job should auto-promote to the canary version"
+  type        = bool
+  default     = true
+}
+
+variable "auto_revert" {
+  description = "Specifies if the job should auto-revert to the last stable job"
+  type        = bool
+  default     = true
 }
 
-variable "prometheus_group_count" {
-  description = "Number of prometheus group instances"
+variable "canary" {
+  description = "Specifies the number of canaries to create on a job update"
   type        = number
   default     = 1
 }
 
-variable "prometheus_service_name" {
-  description = "Prometheus service name"
-  type        = string
-  default     = "prometheus"
+variable "cpu" {
+  description = "CPU allocation"
+  type        = number
+  default     = 2000
 }
 
-variable "prometheus_version" {
-  description = "Prometheus version"
+variable "data_dir" {
+  description = "Prometheus DISK allocation"
   type        = string
-  default     = "v2.28.1"
+  default     = "/data"
 }
 
-variable "prometheus_use_canary" {
-  description = "Uses canary deployment"
-  type        = bool
-  default     = false
+variable "group_count" {
+  description = "Specifies the number of the task groups running under this one"
+  type        = number
+  default     = 4
 }
 
-variable "prometheus_vault_secret" {
-  description = "Set of properties to be able to fetch secret from vault"
-  type = object({
-    use_vault_provider        = bool,
-    vault_kv_policy_name      = string,
-    vault_kv_path             = string,
-    vault_kv_field_access_key = string,
-    vault_kv_field_secret_key = string
-  })
+variable "job_name" {
+  description = "Specifies a name for the job"
+  type        = string
+  default     = "prometheus"
 }
 
-variable "prometheus_cpu" {
-  description = "Prometheus CPU allocation"
+variable "max_parallel" {
+  description = "Specifies the maximum number of updates to perform in parallel"
   type        = number
-  default     = 2000
+  default     = 1
 }
 
-variable "prometheus_mem" {
-  description = "Prometheus RAM allocation"
+variable "memory" {
+  description = "Specifies the memory required in MB"
   type        = number
-  default     = 8192
+  default     = 4096
 }
 
-variable "prometheus_port" {
-  description = "Prometheus TCP allocation"
+variable "port" {
+  description = "Specifies the static TCP/UDP port to allocate"
   type        = number
-  default     = 9200
+  default     = 9090
 }
 
-variable "prometheus_data_dir" {
-  description = "Prometheus DISK allocation"
+variable "service_name" {
+  description = "Specifies the name this service will be advertised in Consul"
   type        = string
-  default     = "/data"
+  default     = "prometheus"
+}
+
+variable "use_canary" {
+  description = "Uses canary deployment"
+  type        = bool
+  default     = true
 }
 
-variable "prometheus_use_host_volume" {
+variable "use_host_volume" {
   description = "Use Nomad host volume feature"
   type        = bool
   default     = false
-}
-\ No newline at end of file
+}
+
+variable "volume_destination" {
+  description = "Specifies where the volume should be mounted inside the task"
+  type        = string
+  default     = "/data/"
+}
+
+variable "vault_secret" {
+  type = object({
+    use_vault_provider        = bool,
+    vault_kv_policy_name      = string,
+    vault_kv_path             = string,
+    vault_kv_field_access_key = string,
+    vault_kv_field_secret_key = string
+  })
+  description = "Set of properties to be able to fetch secret from vault."
+  default = {
+    use_vault_provider        = false
+    vault_kv_policy_name      = "kv"
+    vault_kv_path             = "secret/data/prometheus"
+    vault_kv_field_access_key = "access_key"
+    vault_kv_field_secret_key = "secret_key"
+  }
+}
diff --git a/fdio.infra.terraform/1n_nmd/prometheus/versions.tf b/fdio.infra.terraform/1n_nmd/prometheus/versions.tf
index b80610a525..a01708f28a 100644
--- a/fdio.infra.terraform/1n_nmd/prometheus/versions.tf
+++ b/fdio.infra.terraform/1n_nmd/prometheus/versions.tf
@@ -2,12 +2,8 @@ terraform {
   required_providers {
     nomad = {
       source  = "hashicorp/nomad"
-      version = "~> 1.4.15"
-    }
-    template = {
-      source  = "hashicorp/template"
-      version = "~> 2.2.0"
+      version = ">= 1.4.16"
     }
   }
-  required_version = ">= 1.0.3"
+  required_version = ">= 1.1.4"
 }