From 3b0fd814e4925d4918a3209682d4ef6f5ab3c8e1 Mon Sep 17 00:00:00 2001 From: pmikus Date: Tue, 14 Feb 2023 08:02:04 +0000 Subject: feat(terraform): Update Prometheus Signed-off-by: pmikus Change-Id: I968d713b93ca2ac917cc9f9b9299d6373fbb87db --- .../conf/nomad/prometheus.hcl.tftpl | 262 ++++++++++++++------- .../terraform-nomad-prometheus/fdio/main.tf | 2 +- .../terraform-nomad-prometheus/fdio/versions.tf | 10 +- .../terraform-nomad-prometheus/main.tf | 8 +- .../terraform-nomad-prometheus/variables.tf | 14 +- .../terraform-nomad-prometheus/versions.tf | 4 +- .../terraform-nomad-pyspark-etl/fdio/variables.tf | 4 +- .../terraform-nomad-pyspark-etl/fdio/versions.tf | 2 +- .../terraform-nomad-pyspark-etl/versions.tf | 4 +- 9 files changed, 211 insertions(+), 99 deletions(-) diff --git a/fdio.infra.terraform/terraform-nomad-prometheus/conf/nomad/prometheus.hcl.tftpl b/fdio.infra.terraform/terraform-nomad-prometheus/conf/nomad/prometheus.hcl.tftpl index e3c508dd32..4eb4428988 100644 --- a/fdio.infra.terraform/terraform-nomad-prometheus/conf/nomad/prometheus.hcl.tftpl +++ b/fdio.infra.terraform/terraform-nomad-prometheus/conf/nomad/prometheus.hcl.tftpl @@ -8,18 +8,15 @@ job "${job_name}" { datacenters = "${datacenters}" # The "type" parameter controls the type of job, which impacts the scheduler's - # decision on placement. This configuration is optional and defaults to - # "service". For a full list of job types and their differences, please see - # the online documentation. + # decision on placement. # - # https://www.nomadproject.io/docs/jobspec/schedulers + # https://www.nomadproject.io/docs/jobspec/schedulers # type = "service" update { # The "max_parallel" parameter specifies the maximum number of updates to - # perform in parallel. In this case, this specifies to update a single task - # at a time. + # perform in parallel. max_parallel = ${max_parallel} health_check = "checks" @@ -73,12 +70,11 @@ job "${job_name}" { # the same Nomad client. Any task within a group will be placed on the same # client. # - # https://www.nomadproject.io/docs/job-specification/group + # https://www.nomadproject.io/docs/job-specification/group # group "${job_name}-group-1" { # The "count" parameter specifies the number of the task groups that should - # be running under this group. This value must be non-negative and defaults - # to 1. + # be running under this group. This value must be non-negative. count = ${group_count} # The volume stanza allows the group to specify that it requires a given @@ -86,6 +82,7 @@ job "${job_name}" { # as it will be exposed to task configuration. # # https://www.nomadproject.io/docs/job-specification/volume + # %{ if use_host_volume } volume "${job_name}-volume-1" { type = "host" @@ -100,23 +97,22 @@ job "${job_name}" { # https://www.nomadproject.io/docs/job-specification/restart # restart { - interval = "30m" - attempts = 40 - delay = "15s" - mode = "delay" + interval = "30m" + attempts = 40 + delay = "15s" + mode = "delay" } # The constraint allows restricting the set of eligible nodes. Constraints # may filter on attributes or client metadata. # - # https://www.nomadproject.io/docs/job-specification/constraint + # https://www.nomadproject.io/docs/job-specification/constraint # constraint { attribute = "$${attr.cpu.arch}" operator = "!=" value = "arm64" } - constraint { attribute = "$${node.class}" value = "builder" @@ -129,7 +125,7 @@ job "${job_name}" { # your job will be provisioned on, Nomad will provide your tasks with # network configuration when they start up. # - # https://www.nomadproject.io/docs/job-specification/network + # https://www.nomadproject.io/docs/job-specification/network # network { port "${service_name}" { @@ -141,49 +137,164 @@ job "${job_name}" { # The "task" stanza creates an individual unit of work, such as a Docker # container, web application, or batch processing. # - # https://www.nomadproject.io/docs/job-specification/task + # https://www.nomadproject.io/docs/job-specification/task # task "${job_name}-task-1" { # The "driver" parameter specifies the task driver that should be used to # run the task. driver = "exec" - %{ if use_host_volume } + %{ if use_host_volume } volume_mount { volume = "${job_name}-volume-1" destination = "${volume_destination}" read_only = false } - %{ endif } + %{ endif } - %{ if use_vault_provider } + %{ if use_vault_provider } vault { policies = "${vault_kv_policy_name}" } - %{ endif } + %{ endif } # The "config" stanza specifies the driver configuration, which is passed # directly to the driver to start the task. The details of configurations # are specific to each driver, so please see specific driver # documentation for more information. config { - command = "local/prometheus-${version}.linux-amd64/prometheus" - args = [ + command = "local/prometheus-${version}.linux-amd64/prometheus" + args = [ "--config.file=secrets/prometheus.yml", + "--web.config.file=secrets/web-config.yml", "--storage.tsdb.path=${volume_destination}prometheus/", "--storage.tsdb.retention.time=7d" ] } - # The artifact stanza instructs Nomad to fetch and unpack a remote resource, - # such as a file, tarball, or binary. Nomad downloads artifacts using the - # popular go-getter library, which permits downloading artifacts from a - # variety of locations using a URL as the input source. + # The artifact stanza instructs Nomad to fetch and unpack a remote + # resource, such as a file, tarball, or binary. Nomad downloads artifacts + # using the popular go-getter library, which permits downloading artifacts + # from a variety of locations using a URL as the input source. # - # https://www.nomadproject.io/docs/job-specification/artifact + # https://www.nomadproject.io/docs/job-specification/artifact # artifact { - source = "${url}" + source = "${artifact_source}" + options { + checksum = "sha256:${artifact_source_checksum}" + } + } + + # The "template" stanza instructs Nomad to manage a template, such as + # a configuration file or script. This template can optionally pull data + # from Consul or Vault to populate runtime configuration data. + # + # https://www.nomadproject.io/docs/job-specification/template + # + template { + change_mode = "noop" + change_signal = "SIGINT" + destination = "secrets/cert_file.crt" + left_delimiter = "{{{" + right_delimiter = "}}}" + data = < 0 - for: 0m - labels: - severity: critical - annotations: - summary: "Minio disk offline (instance {{ $labels.instance }})" - description: "Minio disk is offline." - - alert: MinioStorageSpaceExhausted - expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 < 10 - for: 2m - labels: - severity: warning - annotations: - summary: "Minio storage space exhausted (instance {{ $labels.instance }})." - description: "Minio storage space is low (< 10 GB)." - name: "Prometheus" rules: - alert: PrometheusConfigurationReloadFailure @@ -451,7 +544,6 @@ rule_files: - 'alerts.yml' scrape_configs: - - job_name: 'Nomad Cluster' consul_sd_configs: - server: '{{ env "NOMAD_IP_prometheus" }}:8500' @@ -466,17 +558,12 @@ scrape_configs: - job_name: 'Consul Cluster' static_configs: - - targets: [ '10.30.51.16:8500' ] - - targets: [ '10.30.51.17:8500' ] - - targets: [ '10.30.51.18:8500' ] - - targets: [ '10.30.51.19:8500' ] - - targets: [ '10.30.51.20:8500' ] - - targets: [ '10.30.51.21:8500' ] - - targets: [ '10.30.51.22:8500' ] - targets: [ '10.30.51.23:8500' ] - targets: [ '10.30.51.24:8500' ] - targets: [ '10.30.51.25:8500' ] - targets: [ '10.30.51.26:8500' ] + - targets: [ '10.30.51.27:8500' ] + - targets: [ '10.30.51.28:8500' ] - targets: [ '10.30.51.50:8500' ] - targets: [ '10.30.51.51:8500' ] - targets: [ '10.30.51.70:8500' ] @@ -503,17 +590,12 @@ scrape_configs: - job_name: 'Node Exporter' static_configs: - - targets: [ '10.30.51.16:9100' ] - - targets: [ '10.30.51.17:9100' ] - - targets: [ '10.30.51.18:9100' ] - - targets: [ '10.30.51.19:9100' ] - - targets: [ '10.30.51.20:9100' ] - - targets: [ '10.30.51.21:9100' ] - - targets: [ '10.30.51.22:9100' ] - targets: [ '10.30.51.23:9100' ] - targets: [ '10.30.51.24:9100' ] - targets: [ '10.30.51.25:9100' ] - targets: [ '10.30.51.26:9100' ] + - targets: [ '10.30.51.27:9100' ] + - targets: [ '10.30.51.28:9100' ] - targets: [ '10.30.51.50:9100' ] - targets: [ '10.30.51.51:9100' ] - targets: [ '10.30.51.70:9100' ] @@ -526,39 +608,55 @@ scrape_configs: - server: '{{ env "NOMAD_IP_prometheus" }}:8500' services: [ 'alertmanager' ] - - job_name: 'Grafana' - consul_sd_configs: - - server: '{{ env "NOMAD_IP_prometheus" }}:8500' - services: [ 'grafana' ] - - job_name: 'Prometheus' + honor_timestamps: true + params: + format: + - prometheus + scheme: https + follow_redirects: true + enable_http2: true consul_sd_configs: - - server: '{{ env "NOMAD_IP_prometheus" }}:8500' - services: [ 'prometheus' ] + - server: {{ env "CONSUL_HTTP_ADDR" }} + services: + - prometheus + tls_config: + cert_file: cert_file.crt + key_file: key_file.key + insecure_skip_verify: true +EOH + } - - job_name: 'Minio' - bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg - consul_sd_configs: - - server: '{{ env "NOMAD_IP_prometheus" }}:8500' - services: [ 'storage' ] - metrics_path: /minio/prometheus/metrics + template { + change_mode = "noop" + change_signal = "SIGINT" + destination = "secrets/web-config.yml" + left_delimiter = "{{{" + right_delimiter = "}}}" + data = <