job "${job_name}" {
  # The "region" parameter specifies the region in which to execute the job.
  # If omitted, this inherits the default region name of "global".
  # region    = "${region}"

  # The "datacenters" parameter specifies the list of datacenters which should
  # be considered when placing this task. This must be provided.
  #
  # Nomad documents this attribute as an array of strings, so the rendered
  # value is wrapped in a list rather than emitted as a bare string.
  # NOTE(review): the template variable is interpolated as a single string; if
  # the caller packs several datacenter names into it they end up as ONE list
  # element - confirm against the templatefile() call.
  datacenters = ["${datacenters}"]

  # The "type" parameter controls the type of job, which impacts the scheduler's
  # decision on placement.
  #
  # https://www.nomadproject.io/docs/jobspec/schedulers
  #
  type        = "service"

  update {
    # The "max_parallel" parameter specifies the maximum number of updates to
    # perform in parallel.
    max_parallel      = ${max_parallel}

    # The "health_check" parameter selects the mechanism used to decide
    # whether an allocation is healthy; this job uses the "checks" mode.
    health_check      = "checks"

    # The "min_healthy_time" parameter specifies the minimum time the allocation
    # must be in the healthy state before it is marked as healthy and unblocks
    # further allocations from being updated.
    min_healthy_time  = "10s"

    # The "healthy_deadline" parameter specifies the deadline in which the
    # allocation must be marked as healthy after which the allocation is
    # automatically transitioned to unhealthy. Transitioning to unhealthy will
    # fail the deployment and potentially roll back the job if "auto_revert" is
    # set to true.
    healthy_deadline  = "3m"

    # The "progress_deadline" parameter specifies the deadline in which an
    # allocation must be marked as healthy. The deadline begins when the first
    # allocation for the deployment is created and is reset whenever an allocation
    # as part of the deployment transitions to a healthy state. If no allocation
    # transitions to the healthy state before the progress deadline, the
    # deployment is marked as failed.
    progress_deadline = "10m"

    # The "auto_revert" parameter specifies if the job should auto-revert to the
    # last stable job on deployment failure. A job is marked as stable if all the
    # allocations as part of its deployment were marked healthy.
    #
    # This setting is independent of canaries, so it is rendered regardless of
    # "use_canary"; otherwise the caller's value would be silently dropped for
    # plain rolling updates.
    auto_revert       = ${auto_revert}

%{ if use_canary }
    # The "canary" parameter specifies that changes to the job that would result
    # in destructive updates should create the specified number of canaries
    # without stopping any previous allocations. Once the operator determines the
    # canaries are healthy, they can be promoted which unblocks a rolling update
    # of the remaining allocations at a rate of "max_parallel".
    #
    # Further, setting "canary" equal to the count of the task group allows
    # blue/green deployments. When the job is updated, a full set of the new
    # version is deployed and upon promotion the old version is stopped.
    canary            = ${canary}

    # Specifies if the job should auto-promote to the canary version when all
    # canaries become healthy during a deployment. Defaults to false which means
    # canaries must be manually promoted with the "nomad deployment promote"
    # command.
    auto_promote      = ${auto_promote}
%{ endif }
  }

  # The "group" stanza defines a series of tasks that should be co-located on
  # the same Nomad client. Any task within a group will be placed on the same
  # client.
  #
  # https://www.nomadproject.io/docs/job-specification/group
  #
  group "${job_name}-group-1" {
    # The "count" parameter specifies the number of instances of this task
    # group that should be running. This value must be non-negative.
    count = ${group_count}

    # The volume stanza allows the group to specify that it requires a given
    # volume from the cluster. The key of the stanza is the name of the volume
    # as it will be exposed to task configuration.
    #
    # https://www.nomadproject.io/docs/job-specification/volume
    #
    %{ if use_host_volume }
    # Host volume requested for the group; only rendered when the caller
    # enables "use_host_volume". Tasks refer to it by the stanza label.
    volume "${job_name}-volume-1" {
      source    = "${volume_source}"
      type      = "host"
      read_only = false
    }
    %{ endif }

    # The restart stanza configures a task's behavior on task failure. Restarts
    # happen on the client that is running the task.
    #
    # https://www.nomadproject.io/docs/job-specification/restart
    #
    restart {
      # Restart policy: 40 attempts per 30-minute interval, waiting 15 seconds
      # between attempts, using the "delay" mode.
      mode     = "delay"
      attempts = 40
      interval = "30m"
      delay    = "15s"
    }

    # The constraint allows restricting the set of eligible nodes. Constraints
    # may filter on attributes or client metadata.
    #
    # https://www.nomadproject.io/docs/job-specification/constraint
    #
    # Exclude nodes whose CPU architecture is arm64.
    constraint {
      attribute = "$${attr.cpu.arch}"
      operator  = "!="
      value     = "arm64"
    }

    # Only place the group on nodes of class "builder" (equality is the
    # default operator; spelled out here for readability).
    constraint {
      attribute = "$${node.class}"
      operator  = "="
      value     = "builder"
    }

    # The network stanza specifies the networking requirements for the task
    # group, including the network mode and port allocations. When scheduling
    # jobs in Nomad they are provisioned across your fleet of machines along
    # with other jobs and services. Because you don't know in advance what host
    # your job will be provisioned on, Nomad will provide your tasks with
    # network configuration when they start up.
    #
    # https://www.nomadproject.io/docs/job-specification/network
    #
    network {
      # The port label is the service name; the same port number is used for
      # both the static host port and the "to" mapping.
      port "${service_name}" {
        to     = ${port}
        static = ${port}
      }
    }

    # The "task" stanza creates an individual unit of work, such as a Docker
    # container, web application, or batch processing.
    #
    # https://www.nomadproject.io/docs/job-specification/task
    #
    task "${job_name}-task-1" {
      # The "driver" parameter specifies the task driver that should be used to
      # run the task.
      driver = "exec"

      %{ if use_host_volume }
      # Mounts the group's host volume read-write at the configured path; only
      # rendered when the caller enables "use_host_volume".
      volume_mount {
        destination = "${volume_destination}"
        volume      = "${job_name}-volume-1"
        read_only   = false
      }
      %{ endif }

      %{ if use_vault_provider }
      # Vault integration for the task; only rendered when the caller enables
      # "use_vault_provider". Nomad documents "policies" as an array of
      # strings, so the single policy name is wrapped in a list.
      vault {
        policies = ["${vault_kv_policy_name}"]
      }
      %{ endif }

      # The "config" stanza specifies the driver configuration, which is passed
      # directly to the driver to start the task. The details of configurations
      # are specific to each driver, so please see specific driver
      # documentation for more information.
      config {
        # Command-line flags for Prometheus. Both configuration files are the
        # ones rendered by the template stanzas of this task.
        # The TSDB path is built by appending "prometheus/" directly to the
        # volume destination, so that value is expected to end with "/".
        args    = [
          "--config.file=secrets/prometheus.yml",
          "--web.config.file=secrets/web-config.yml",
          "--storage.tsdb.path=${volume_destination}prometheus/",
          "--storage.tsdb.retention.time=7d"
        ]

        # Binary path inside the task directory; the directory name embeds the
        # requested version and the linux-amd64 platform.
        command = "local/prometheus-${version}.linux-amd64/prometheus"
      }

      # The artifact stanza instructs Nomad to fetch and unpack a remote
      # resource, such as a file, tarball, or binary. Nomad downloads artifacts
      # using the popular go-getter library, which permits downloading artifacts
      # from a variety of locations using a URL as the input source.
      #
      # https://www.nomadproject.io/docs/job-specification/artifact
      #
      artifact {
        # The download is checked against the supplied SHA-256 digest.
        options {
          checksum = "sha256:${artifact_source_checksum}"
        }
        source = "${artifact_source}"
      }

      # The "template" stanza instructs Nomad to manage a template, such as
      # a configuration file or script. This template can optionally pull data
      # from Consul or Vault to populate runtime configuration data.
      #
      # https://www.nomadproject.io/docs/job-specification/template
      #
      template {
        # Writes the TLS certificate below to secrets/cert_file.crt. That file
        # name is referenced by the web-config.yml and prometheus.yml templates
        # of this task.
        # The delimiters are switched to "{{{" / "}}}" for this stanza; the PEM
        # body itself contains no template actions.
        # NOTE(review): change_signal is set while change_mode is "noop" -
        # presumably unused in this mode; confirm and drop if so.
        # NOTE(review): the certificate is hard-coded in the job template and
        # appears to be self-signed with a short validity window - check it with
        # `openssl x509 -noout -subject -dates` and consider sourcing it from a
        # secret store instead.
        change_mode     = "noop"
        change_signal   = "SIGINT"
        destination     = "secrets/cert_file.crt"
        left_delimiter  = "{{{"
        right_delimiter = "}}}"
        data            = <<EOH
-----BEGIN CERTIFICATE-----
MIIFszCCA5ugAwIBAgIUDtmFbbnYaXbXH5ddtHi9l25wM7owDQYJKoZIhvcNAQEL
BQAwaTELMAkGA1UEBhMCU0sxEzARBgNVBAgMClNvbWUtU3RhdGUxITAfBgNVBAoM
GEludGVybmV0IFdpZGdpdHMgUHR5IEx0ZDEiMCAGA1UEAwwZcHJvbWV0aGV1cy5z
ZXJ2aWNlLmNvbnN1bDAeFw0yMjEyMzEyMDMxMDFaFw0yMzAxMzAyMDMxMDFaMGkx
CzAJBgNVBAYTAlNLMRMwEQYDVQQIDApTb21lLVN0YXRlMSEwHwYDVQQKDBhJbnRl
cm5ldCBXaWRnaXRzIFB0eSBMdGQxIjAgBgNVBAMMGXByb21ldGhldXMuc2Vydmlj
ZS5jb25zdWwwggIiMA0GCSqGSIb3DQEBAQUAA4ICDwAwggIKAoICAQCGH4Tyj+9G
wYJNb3ubIdr5r0/DZL6XEnRIMiz88TN2QmdwAGKyQqQd7ka0IkdDHPhpRuK8IV1g
ELQhKab7YJCa6zWuy+rQ6JFlotGC+2tIXd3MDriUd1VPVoX6fw/5zUK/2j6exBk4
iqxPXHchQLzZ0viUXhQIBS1IUMTbfc0vjA8U0uPgpmAR7ieePWFwmUDxjOLMvJw6
+goeOfaHhW4yYgT+kg7L3rT62G+KG6Op/p7k7BNZ6G6Y6K6uJ7Z/AayAClF2sPZz
UIGr0uEDvD4IcAsfQgpR5vK/SVBFU5+DSO68mm11m+8IH/HA6GvNSEvCRC0Wtrsm
Dyq+9S3wZ7tNi7msjQWWKTB1GvTbCbPE1G/q5GJdoKUnioys6AMP4DTEV9o3lCSg
0sjYnkSTKgRplnuY/7Y2qSNnD1Rw0ZneSkF+8ocgiYcTvtyOY2fkhlT2VaQLX987
m7892ikPvoCnc/LVeREWW7hCuIQ1E1CCqg304Kd9gCgKoOGXoYmC/3wgJW0RkaM0
x5DpNLYx0y11CPVg315dvprOuedap6J3CNhBE3fO8ymwepFTzTcWLWgSVWrRLZnx
Lgb4SPhjxPg6cCZptkmXrPA+9SgW8iNHd/Fer6MAs82Kcp2T1C+qq9RurL/jjxTD
JaFrwZC2lgWELToMyVDrkBJJbA/2cU9CMQIDAQABo1MwUTAdBgNVHQ4EFgQUx1Mi
fylZExNnIz0EkrPRdXYmHmAwHwYDVR0jBBgwFoAUx1MifylZExNnIz0EkrPRdXYm
HmAwDwYDVR0TAQH/BAUwAwEB/zANBgkqhkiG9w0BAQsFAAOCAgEAbvlpMg4YRTNe
0cgqMZky/GpNjvE/zFManUGgYns8TKyZ8U0laBxRQ4XU/fASwAcOBJYtrkG7w8z+
FaOUptaOlNGW1VWsPDJt8ZQ2gAcTwKSW2EsBWCmOUJVNH5F0f6fTSqIUIXyxhP2w
JVniSkfarhb/Y1EDCACdr7Xpu6iF+nQo2o4/HE4Wkto4qwvlrdApYv4dl5J1TWjq
72fO9axDlNnEGVxa3C3xvKOQqWrEUy/HqC9p4it1yCiq6IYVLyve0meVFBY9xNXU
137AN7ks4ouuR1FZQkhLtqFuIekSZ5l4G4alwdv1NB8vohJMuMJyk9DarTLqXcYU
1uypZSmgREn8ByYrj4ochkSpiPw7wgK4H1Aa2cy4KUuzmLLShYu6Mov7hyJDoJSe
JsDVNoEBuhql4jENATqbWT3pIgYwBvBEXuYXqekcNmVZkKiSOlsxKFfSz21HYDgA
lCu4SMtlRYHcm4TuoTuy/FEPxHSjFY3pMciJrnO/qUrv9LlWPe1wjKhZLRPEebTk
r+Oh+aVWpy3ps7shPTjczOrmQykWWBGAjndZjZi4VvZNRxkGZuNwzzZcEkzt0Db7
l83pTRD58mvLHWl2QXoBS3t7IM6sOMwQvPx1Inp7hb7UIpNsJQaUrhhfKqy0sK18
mXs4VRtrxYycXxsLbk0SaZGh+juT53M=
-----END CERTIFICATE-----
EOH
      }

      # Writes the TLS private key below to secrets/key_file.key. That file
      # name is referenced by the web-config.yml and prometheus.yml templates
      # of this task.
      #
      # NOTE(review): SECURITY - the private key is committed in plain text in
      # this job template, so everyone who can read the template (and
      # presumably the submitted job specification) can read the key. Treat it
      # as compromised; rotate it and load the replacement from a secret store
      # (for example the Vault integration this template already supports)
      # instead of embedding it here.
      template {
        # NOTE(review): change_signal is set while change_mode is "noop" -
        # presumably unused in this mode; confirm and drop if so.
        change_mode     = "noop"
        change_signal   = "SIGINT"
        destination     = "secrets/key_file.key"
        # Custom delimiters; the PEM body contains no template actions.
        left_delimiter  = "{{{"
        right_delimiter = "}}}"
        data            = <<EOH
-----BEGIN PRIVATE KEY-----
MIIJQQIBADANBgkqhkiG9w0BAQEFAASCCSswggknAgEAAoICAQCGH4Tyj+9GwYJN
b3ubIdr5r0/DZL6XEnRIMiz88TN2QmdwAGKyQqQd7ka0IkdDHPhpRuK8IV1gELQh
Kab7YJCa6zWuy+rQ6JFlotGC+2tIXd3MDriUd1VPVoX6fw/5zUK/2j6exBk4iqxP
XHchQLzZ0viUXhQIBS1IUMTbfc0vjA8U0uPgpmAR7ieePWFwmUDxjOLMvJw6+goe
OfaHhW4yYgT+kg7L3rT62G+KG6Op/p7k7BNZ6G6Y6K6uJ7Z/AayAClF2sPZzUIGr
0uEDvD4IcAsfQgpR5vK/SVBFU5+DSO68mm11m+8IH/HA6GvNSEvCRC0WtrsmDyq+
9S3wZ7tNi7msjQWWKTB1GvTbCbPE1G/q5GJdoKUnioys6AMP4DTEV9o3lCSg0sjY
nkSTKgRplnuY/7Y2qSNnD1Rw0ZneSkF+8ocgiYcTvtyOY2fkhlT2VaQLX987m789
2ikPvoCnc/LVeREWW7hCuIQ1E1CCqg304Kd9gCgKoOGXoYmC/3wgJW0RkaM0x5Dp
NLYx0y11CPVg315dvprOuedap6J3CNhBE3fO8ymwepFTzTcWLWgSVWrRLZnxLgb4
SPhjxPg6cCZptkmXrPA+9SgW8iNHd/Fer6MAs82Kcp2T1C+qq9RurL/jjxTDJaFr
wZC2lgWELToMyVDrkBJJbA/2cU9CMQIDAQABAoICAA5AQByT3Z07h3BZ5ZzUqpM4
JPYCeNvNeqyHJE+WA11P7fSxHcuKGC0T+dA/Cipf5CcvgHzz4JuJ+tHBPrxcBNFp
J5GUmjUrWPOfKrrLoxkT3DLH56Xizh45d8/ne1eUD0EaW+f7tyBSX7+o+AGBAu/0
IjSFkIRPpIGYD2qxAcHJFHsmc08V7oRJNU1zgSx5JDTmPtz5N3Juye9vQjohG9Xf
o183Pro7xigXIjbe+/NemhyB1waJE2NM6e6YSqRRFbafIgvF/tG+3qBWrlD6ye6U
lSHznuwX6XgYvp43Je5JrBA/Kl1CPdIzrrjMGVQ9F8ui+dV9ggInv2d93q06IGUU
D1o9XsZivYkn1EkLEhFXD5CYj6oR1M+MyvUrBD0bJePQCBUo+WJ2sEDt9PN2AtFL
9j7NKK/xXX5cTdAajeIvSS1PUGAHi7r1OF/c7bn3UFNOuOBEYzLsSZGP34AVglor
NON0ENCTuylmDSFd8vpaKFQpV5SK3M2k8dPRe7VEu2C9UlRvAq0xnabSHNxbwNLU
KuGDMSCKDc2npf3oCeQKU2PngAcePnwWSiapAkf5OqltQ/vMbrEpROpfzXLlRxLZ
76MDMFMQkT7m0hik6aPBHTitcWRalxHhK0ze8GvO0wesIBdyYShPKg+VDNg3qFMm
epVXzoi8xNzW8S6yi9DJAoIBAQC2l90VF5evDsv8nwsWMIa/rFGGLitpw23+oNcZ
xsIDMsGie06GYwzYHNRsd3sqK5TNLtl2vJGaVNbeDcC5T22NAYPRjNas7I5svIki
SnT4K68ICIVVxtfETbh2qoXSu+O3pyWJmHqqcQrvW2DlUvs0nxk/v3GukFjTVbuU
qmXp1KjPAVMNYoWNCJkHLEpq6e3K3q4YhEImGhMbN8suvVR9+fkKx8QvKHcqT2kn
9AlK7t57IPqovbni9KMfMZ+wPqw6HsYTL8lQE5NaqMB5q9Pl3SnzcRR0FSadNAiD
/W9jWyMazE0UsNDn241X81tVlU78Kx9S/IN97m/FSeDA1XudAoIBAQC8CzVeHxTw
U+ts/fi1XEuWOph2cIm6qd4aiyGX/riux0O6GUFuIQkosP5StWJyNPLBohWHC6eq
hPk7b0vPWmxuhttUPLA/+6+CICC0jEMWvnDAd5aJULfT0pTLZyizVu2f/GbVaiL6
pgsqeGyKnuh9cNTW5w7Mc45fXkgyKrB4W5aPfjoHN51n+jUqaDrfrp3CoWFviNDn
n3WNFtgrkj/jzQM8XFixhwxADfjd8+sZVmHT4GYjIDS4pCqs5gtIZYKhXDb0Dydj
fH/HiEXC63z0SuFjGNbomC/Era7kI3+1aK2qs6dyASzZKDN6dHKYoalHReUe/Cxk
prRcyYRWhA6lAoIBAEVrLy5Zrd1sLrl4beqdwF0W0lfFLdQj7Kml1KGEIza8EUoI
vy3wcm2naEtkkXrS3tuzOBIgVurp3lbFu8O4Ito8/TSp6uQLe4pzk19qF1ZSpVTU
iHy4AEgtlDfpVL9tl4G3FlpdkiVCnPmrMAd/qOm0oxDNZBcN4fdW3N4EeoKPyy4I
Pt8T2dpormU/vXswPKuoRWAkyFFcEG+Eosa+TGUoqDolAL09ETEQx9XcvbuzXPpK
64FDwGw8vdeaMi/7Y9ck5AFfZZYAG0GYbrTTUthNYSmgkDoh4HBb2/DyZWrMt2f0
zElVf9bmbbJGXy8GeOT+MAaI4iT6hZvoHn6xqzECggEABoQg6k0LbbSKwPEgEDDN
kbwgEmKd8zD1uFe/50N1ZOEU0LsVUFqmtZlEhtswOSLqkpkqQ868laUb+dpGdz37
6eyUZxvfQ6hWEZ1JZNhDbuNUhubd+Y4pgJaYf1/owiYt/9BAQ/70jVj5pBQeNsOA
7O/fAD9rfNw4P8fFmq9uBA2wbvKB0kQ0GSlLdFe+SogDgX4UIUhNbOlSqnvzK7da
rWsqRIoyrJwwaXvSduZ/7BXZN/1brLXt/cP6kpk6JN0XpL3MTbLEu6bRyrlHKZT9
dH2vx75RnCfB5//YwqEUSNYCxpqJH+M4iaHh/slQO0fG1OhwIx278BTyxRBanKDg
3QKCAQBoVnM3PDqaSAT1g3f3neYiXyZektJganRLj5wmDXYAySM2ag/oDacswmP/
J0BQ9KYK+dSgXldlaXtC05oxdhxY5cawbCFNfbjGDZ6zGwgLDocyFtqOBZf6UXCV
Gtj/9r6iyD2/2wbo/lrS0d3yNcNN0nkZUxoyl+J6uGB1o8bo+cfL+mi4pkALKV8L
Oa/fPazAQtikZBHSWtdQamyUMFSAdMUeYIhaXBfkNUZG4sz9nKD5UGBOmquLMBt6
zBPM+4dv4x/MEAEnSC2ANW8vDGFBgG/5H5+j2F0RM6O1MlkDzrOAIvUTrMJlJDBt
775JbZNCKpaELqxy4BNPfRDEJGBh
-----END PRIVATE KEY-----
EOH
      }

      # The "template" stanza instructs Nomad to manage a template, such as
      # a configuration file or script. This template can optionally pull data
      # from Consul or Vault to populate runtime configuration data.
      #
      #     https://www.nomadproject.io/docs/job-specification/template
      #
      # Renders the Prometheus alerting rules to secrets/alerts.yml, the file
      # listed under "rule_files" in the prometheus.yml template of this task.
      #
      # The delimiters are switched to "{{{" / "}}}" so that the "{{ ... }}"
      # expressions in the rule annotations are written to the file verbatim.
      template {
        change_mode     = "noop"
        change_signal   = "SIGINT"
        destination     = "secrets/alerts.yml"
        left_delimiter  = "{{{"
        right_delimiter = "}}}"
        data            = <<EOH
---
groups:
- name: "Jenkins Job Health Exporter"
  rules:
  - alert: JenkinsJobHealthExporterFailures
    expr: jenkins_job_failure{id=~".*"} > jenkins_job_success{id=~".*"}
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Jenkins Job Health detected high failure rate on jenkins jobs."
      description: "Job: {{ $labels.id }}"
  - alert: JenkinsJobHealthExporterUnstable
    expr: jenkins_job_unstable{id=~".*"} > jenkins_job_success{id=~".*"}
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Jenkins Job Health detected high unstable rate on jenkins jobs."
      description: "Job: {{ $labels.id }}"
- name: "Consul"
  rules:
  - alert: ConsulServiceHealthcheckFailed
    expr: consul_catalog_service_node_healthy == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Consul service healthcheck failed (instance {{ $labels.instance }})."
      description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`."
  - alert: ConsulMissingMasterNode
    expr: consul_raft_peers < 3
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Consul missing master node (instance {{ $labels.instance }})."
      description: "Numbers of consul raft peers should be 3, in order to preserve quorum."
  - alert: ConsulAgentUnhealthy
    expr: consul_health_node_status{status="critical"} == 1
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Consul agent unhealthy (instance {{ $labels.instance }})."
      description: "A Consul agent is down."
- name: "Hosts"
  rules:
  - alert: NodeDown
    expr: up == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus target missing (instance {{ $labels.instance }})."
      description: "A Prometheus target has disappeared. An exporter might be crashed."
  - alert: HostOutOfMemory
    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Host out of memory (instance {{ $labels.instance }})."
      description: "Node memory is filling up (< 10% left)."
  - alert: HostOomKillDetected
    expr: increase(node_vmstat_oom_kill[1m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Host OOM kill detected (instance {{ $labels.instance }})."
      description: "OOM kill detected."
  - alert: HostMemoryUnderMemoryPressure
    expr: rate(node_vmstat_pgmajfault[1m]) > 1000
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Host memory under memory pressure (instance {{ $labels.instance }})."
      description: "The node is under heavy memory pressure. High rate of major page faults."
  - alert: HostOutOfDiskSpace
    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Host out of disk space (instance {{ $labels.instance }})."
      description: "Disk is almost full (< 10% left)."
  - alert: HostRaidDiskFailure
    expr: node_md_disks{state="failed"} > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Host RAID disk failure (instance {{ $labels.instance }})."
      description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap."
  - alert: HostConntrackLimit
    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host conntrack limit (instance {{ $labels.instance }})."
      description: "The number of conntrack is approaching limit."
  - alert: HostNetworkInterfaceSaturated
    expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Host Network Interface Saturated (instance {{ $labels.instance }})."
      description: "The network interface {{ $labels.device }} on {{ $labels.instance }} is getting overloaded."
  - alert: HostSystemdServiceCrashed
    expr: node_systemd_unit_state{state="failed"} == 1
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Host SystemD service crashed (instance {{ $labels.instance }})."
      description: "SystemD service crashed."
  - alert: HostEdacCorrectableErrorsDetected
    expr: increase(node_edac_correctable_errors_total[1m]) > 0
    for: 0m
    labels:
      severity: info
    annotations:
      summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})."
      description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.'
  - alert: HostEdacUncorrectableErrorsDetected
    expr: node_edac_uncorrectable_errors_total > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})."
      description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC.'
- name: "Prometheus"
  rules:
  - alert: PrometheusConfigurationReloadFailure
    expr: prometheus_config_last_reload_successful != 1
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})."
      description: "Prometheus configuration reload error."
  - alert: PrometheusTooManyRestarts
    expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus too many restarts (instance {{ $labels.instance }})."
      description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping."
  - alert: PrometheusAlertmanagerConfigurationReloadFailure
    expr: alertmanager_config_last_reload_successful != 1
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})."
      description: "AlertManager configuration reload error."
  - alert: PrometheusRuleEvaluationFailures
    expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})."
      description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts."
  - alert: PrometheusTargetScrapingSlow
    expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus target scraping slow (instance {{ $labels.instance }})."
      description: "Prometheus is scraping exporters slowly."
  - alert: PrometheusTsdbCompactionsFailed
    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})."
      description: "Prometheus encountered {{ $value }} TSDB compactions failures."
  - alert: PrometheusTsdbHeadTruncationsFailed
    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})."
      description: "Prometheus encountered {{ $value }} TSDB head truncation failures."
  - alert: PrometheusTsdbWalCorruptions
    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})."
      description: "Prometheus encountered {{ $value }} TSDB WAL corruptions."
  - alert: PrometheusTsdbWalTruncationsFailed
    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})."
      description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures."
EOH
      }

      # Renders the main Prometheus configuration to secrets/prometheus.yml,
      # the file passed to Prometheus via "--config.file".
      #
      # This stanza keeps the default "{{ }}" delimiters, so the env lookups
      # below are evaluated by Nomad when the file is rendered. The Consul
      # address is built from the IP that Nomad exposes for this group's port;
      # the port label is the service name, so the variable name is derived
      # from it rather than hard-coding "prometheus".
      # NOTE(review): if the service name contains characters that are not
      # valid in an environment variable name, the variable Nomad exports may
      # be spelled differently - confirm before using such a name.
      # NOTE(review): the "Prometheus" scrape job reads the Consul address from
      # CONSUL_HTTP_ADDR instead; confirm that variable is present in the task
      # environment, otherwise "server" renders empty.
      template {
        change_mode     = "noop"
        change_signal   = "SIGINT"
        destination     = "secrets/prometheus.yml"
        data            = <<EOH
---
global:
  scrape_interval:     5s
  scrape_timeout:      5s
  evaluation_interval: 5s

alerting:
  alertmanagers:
  - consul_sd_configs:
    - server: '{{ env "NOMAD_IP_${service_name}" }}:8500'
      services: [ 'alertmanager' ]

rule_files:
  - 'alerts.yml'

scrape_configs:
  - job_name: 'Nomad Cluster'
    consul_sd_configs:
    - server: '{{ env "NOMAD_IP_${service_name}" }}:8500'
      services: [ 'nomad-client', 'nomad' ]
    relabel_configs:
    - source_labels: [__meta_consul_tags]
      regex: '(.*)http(.*)'
      action: keep
    metrics_path: /v1/metrics
    params:
      format: [ 'prometheus' ]

  - job_name: 'Consul Cluster'
    static_configs:
      - targets: [ '10.30.51.23:8500' ]
      - targets: [ '10.30.51.24:8500' ]
      - targets: [ '10.30.51.25:8500' ]
      - targets: [ '10.30.51.26:8500' ]
      - targets: [ '10.30.51.27:8500' ]
      - targets: [ '10.30.51.28:8500' ]
      - targets: [ '10.30.51.50:8500' ]
      - targets: [ '10.30.51.51:8500' ]
      - targets: [ '10.30.51.70:8500' ]
      - targets: [ '10.30.51.71:8500' ]
      - targets: [ '10.30.51.91:8500' ]
      - targets: [ '10.30.51.92:8500' ]
    metrics_path: /v1/agent/metrics
    params:
      format: [ 'prometheus' ]

  - job_name: 'Jenkins Job Health Exporter'
    static_configs:
      - targets: [ '10.30.51.22:9186' ]
    metric_relabel_configs:
      - source_labels: [ __name__ ]
        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
        action: replace
        replacement: '$1'
        target_label: id
      - source_labels: [ __name__ ]
        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
        replacement: 'jenkins_job_$2'
        target_label: __name__

  - job_name: 'Node Exporter'
    static_configs:
      - targets: [ '10.30.51.23:9100' ]
      - targets: [ '10.30.51.24:9100' ]
      - targets: [ '10.30.51.25:9100' ]
      - targets: [ '10.30.51.26:9100' ]
      - targets: [ '10.30.51.27:9100' ]
      - targets: [ '10.30.51.28:9100' ]
      - targets: [ '10.30.51.50:9100' ]
      - targets: [ '10.30.51.51:9100' ]
      - targets: [ '10.30.51.70:9100' ]
      - targets: [ '10.30.51.71:9100' ]
      - targets: [ '10.30.51.91:9100' ]
      - targets: [ '10.30.51.92:9100' ]

  - job_name: 'Alertmanager'
    consul_sd_configs:
    - server: '{{ env "NOMAD_IP_${service_name}" }}:8500'
      services: [ 'alertmanager' ]

  - job_name: 'Prometheus'
    honor_timestamps: true
    params:
      format:
      - prometheus
    scheme: https
    follow_redirects: true
    enable_http2: true
    consul_sd_configs:
    - server: {{ env "CONSUL_HTTP_ADDR" }}
      services:
      - prometheus
    tls_config:
      cert_file: cert_file.crt
      key_file: key_file.key
      insecure_skip_verify: true
EOH
      }

      # Renders the web configuration passed to Prometheus via
      # "--web.config.file"; it names the certificate and key files written by
      # the other template stanzas of this task.
      template {
        destination     = "secrets/web-config.yml"
        change_mode     = "noop"
        change_signal   = "SIGINT"
        left_delimiter  = "{{{"
        right_delimiter = "}}}"
        data            = <<EOH
---
tls_server_config:
  cert_file: cert_file.crt
  key_file: key_file.key
EOH
      }

      # The service stanza instructs Nomad to register a service with Consul.
      #
      # https://www.nomadproject.io/docs/job-specification/service
      #
      service {
        # The service name doubles as the port label declared in the group's
        # network stanza; the tag appends the allocation index to the name.
        name = "${service_name}"
        port = "${service_name}"
        tags = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ]

        # HTTPS liveness probe against "/-/healthy" every 10 seconds with a
        # 2-second timeout; certificate verification is skipped.
        check {
          name            = "Prometheus Check Live"
          type            = "http"
          protocol        = "https"
          path            = "/-/healthy"
          tls_skip_verify = true
          timeout         = "2s"
          interval        = "10s"
        }
      }

      # The "resources" stanza describes the requirements a task needs to
      # execute. Resource requirements include memory, network, cpu, and more.
      # This ensures the task will execute on a machine that contains enough
      # resource capacity.
      #
      # https://www.nomadproject.io/docs/job-specification/resources
      #
      resources {
        # Both values are supplied by the caller of this template.
        memory = ${memory}
        cpu    = ${cpu}
      }
    }
  }
}