diff options
author | Peter Mikus <pmikus@cisco.com> | 2022-02-09 11:07:53 +0100 |
---|---|---|
committer | Peter Mikus <pmikus@cisco.com> | 2022-02-09 11:07:53 +0100 |
commit | bc676c9e1e5ef3545ef442db2023d7fcdfe3b034 (patch) | |
tree | 3b9e7081d0c30138441785fcf57b9c149469a0cc /fdio.infra.terraform/1n_nmd/prometheus | |
parent | 0bbb81c4fd1afdee6eb23ba4d49171d8dced6b19 (diff) |
feat(terraform): Refactor Prometheus
Signed-off-by: Peter Mikus <pmikus@cisco.com>
Change-Id: I47a05fc0207d691d84c6e65c7b84997edab8e272
Diffstat (limited to 'fdio.infra.terraform/1n_nmd/prometheus')
8 files changed, 832 insertions, 77 deletions
diff --git a/fdio.infra.terraform/1n_nmd/prometheus/conf/nomad/prometheus.hcl.tftpl b/fdio.infra.terraform/1n_nmd/prometheus/conf/nomad/prometheus.hcl.tftpl
new file mode 100644
index 0000000000..224f7e5e00
--- /dev/null
+++ b/fdio.infra.terraform/1n_nmd/prometheus/conf/nomad/prometheus.hcl.tftpl
@@ -0,0 +1,624 @@
+job "${job_name}" {
+  # The "region" parameter specifies the region in which to execute the job.
+  # If omitted, this inherits the default region name of "global".
+  # region    = "${region}"
+
+  # The "datacenters" parameter specifies the list of datacenters which should
+  # be considered when placing this task. This must be provided.
+  datacenters = "${datacenters}"
+
+  # The "type" parameter controls the type of job, which impacts the scheduler's
+  # decision on placement. This configuration is optional and defaults to
+  # "service". For a full list of job types and their differences, please see
+  # the online documentation.
+  #
+  # https://www.nomadproject.io/docs/jobspec/schedulers
+  #
+  type        = "service"
+
+  update {
+    # The "max_parallel" parameter specifies the maximum number of updates to
+    # perform in parallel. In this case, this specifies to update a single task
+    # at a time.
+    max_parallel      = ${max_parallel}
+
+    health_check      = "checks"
+
+    # The "min_healthy_time" parameter specifies the minimum time the allocation
+    # must be in the healthy state before it is marked as healthy and unblocks
+    # further allocations from being updated.
+    min_healthy_time  = "10s"
+
+    # The "healthy_deadline" parameter specifies the deadline in which the
+    # allocation must be marked as healthy after which the allocation is
+    # automatically transitioned to unhealthy. Transitioning to unhealthy will
+    # fail the deployment and potentially roll back the job if "auto_revert" is
+    # set to true.
+    healthy_deadline  = "3m"
+
+    # The "progress_deadline" parameter specifies the deadline in which an
+    # allocation must be marked as healthy. The deadline begins when the first
+    # allocation for the deployment is created and is reset whenever an allocation
+    # as part of the deployment transitions to a healthy state. If no allocation
+    # transitions to the healthy state before the progress deadline, the
+    # deployment is marked as failed.
+    progress_deadline = "10m"
+
+%{ if use_canary }
+    # The "canary" parameter specifies that changes to the job that would result
+    # in destructive updates should create the specified number of canaries
+    # without stopping any previous allocations. Once the operator determines the
+    # canaries are healthy, they can be promoted which unblocks a rolling update
+    # of the remaining allocations at a rate of "max_parallel".
+    #
+    # Further, setting "canary" equal to the count of the task group allows
+    # blue/green deployments. When the job is updated, a full set of the new
+    # version is deployed and upon promotion the old version is stopped.
+    canary            = ${canary}
+
+    # Specifies if the job should auto-promote to the canary version when all
+    # canaries become healthy during a deployment. Defaults to false which means
+    # canaries must be manually updated with the nomad deployment promote
+    # command.
+    auto_promote      = ${auto_promote}
+
+    # The "auto_revert" parameter specifies if the job should auto-revert to the
+    # last stable job on deployment failure. A job is marked as stable if all the
+    # allocations as part of its deployment were marked healthy.
+    auto_revert       = ${auto_revert}
+%{ endif }
+  }
+
+  # The "group" stanza defines a series of tasks that should be co-located on
+  # the same Nomad client. Any task within a group will be placed on the same
+  # client.
+  #
+  # https://www.nomadproject.io/docs/job-specification/group
+  #
+  group "${job_name}-group-1" {
+    # The "count" parameter specifies the number of the task groups that should
+    # be running under this group. This value must be non-negative and defaults
+    # to 1.
+    count = ${group_count}
+
+    # The volume stanza allows the group to specify that it requires a given
+    # volume from the cluster. The key of the stanza is the name of the volume
+    # as it will be exposed to task configuration.
+    #
+    # https://www.nomadproject.io/docs/job-specification/volume
+    %{ if use_host_volume }
+    volume "${job_name}-volume-1" {
+      type      = "host"
+      read_only = false
+      source    = "${volume_source}"
+    }
+    %{ endif }
+
+    # The restart stanza configures a task's behavior on task failure. Restarts
+    # happen on the client that is running the task.
+    #
+    # https://www.nomadproject.io/docs/job-specification/restart
+    #
+    restart {
+      interval = "30m"
+      attempts = 40
+      delay    = "15s"
+      mode     = "delay"
+    }
+
+    # The constraint allows restricting the set of eligible nodes. Constraints
+    # may filter on attributes or client metadata.
+    #
+    # https://www.nomadproject.io/docs/job-specification/constraint
+    #
+    constraint {
+      attribute = "$${attr.cpu.arch}"
+      operator  = "!="
+      value     = "arm64"
+    }
+
+    constraint {
+      attribute = "$${node.class}"
+      value     = "builder"
+    }
+
+    # The network stanza specifies the networking requirements for the task
+    # group, including the network mode and port allocations. When scheduling
+    # jobs in Nomad they are provisioned across your fleet of machines along
+    # with other jobs and services. Because you don't know in advance what host
+    # your job will be provisioned on, Nomad will provide your tasks with
+    # network configuration when they start up.
+    #
+    # https://www.nomadproject.io/docs/job-specification/network
+    #
+    network {
+      port "${service_name}" {
+        static = ${port}
+        to     = ${port}
+      }
+    }
+
+    # The "task" stanza creates an individual unit of work, such as a Docker
+    # container, web application, or batch processing.
+    #
+    # https://www.nomadproject.io/docs/job-specification/task
+    #
+    task "${job_name}-task-1" {
+      # The "driver" parameter specifies the task driver that should be used to
+      # run the task.
+      driver = "exec"
+
+      %{ if use_host_volume }
+      volume_mount {
+        volume      = "${job_name}-volume-1"
+        destination = "${volume_destination}"
+        read_only   = false
+      }
+      %{ endif }
+
+      %{ if use_vault_provider }
+      vault {
+        policies = "${vault_kv_policy_name}"
+      }
+      %{ endif }
+
+      # The "config" stanza specifies the driver configuration, which is passed
+      # directly to the driver to start the task. The details of configurations
+      # are specific to each driver, so please see specific driver
+      # documentation for more information.
+      config {
+        command = "local/prometheus-${version}.linux-amd64/prometheus"
+        args    = [
+          "--config.file=secrets/prometheus.yml",
+          "--storage.tsdb.path=${volume_destination}prometheus/",
+          "--storage.tsdb.retention.time=7d"
+        ]
+      }
+
+      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,
+      # such as a file, tarball, or binary. Nomad downloads artifacts using the
+      # popular go-getter library, which permits downloading artifacts from a
+      # variety of locations using a URL as the input source.
+      #
+      # https://www.nomadproject.io/docs/job-specification/artifact
+      #
+      artifact {
+        source = "${url}"
+      }
+
+      # The "template" stanza instructs Nomad to manage a template, such as
+      # a configuration file or script. This template can optionally pull data
+      # from Consul or Vault to populate runtime configuration data.
+      #
+      # https://www.nomadproject.io/docs/job-specification/template
+      #
+      template {
+        change_mode     = "noop"
+        change_signal   = "SIGINT"
+        destination     = "secrets/alerts.yml"
+        left_delimiter  = "{{{"
+        right_delimiter = "}}}"
+        data            = <<EOH
+---
+groups:
+- name: "Jenkins Job Health Exporter"
+  rules:
+  - alert: JenkinsJobHealthExporterFailures
+    expr: jenkins_job_failure{id=~".*"} > jenkins_job_success{id=~".*"}
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Jenkins Job Health detected high failure rate on jenkins jobs."
+      description: "Job: {{ $labels.id }}"
+  - alert: JenkinsJobHealthExporterUnstable
+    expr: jenkins_job_unstable{id=~".*"} > jenkins_job_success{id=~".*"}
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Jenkins Job Health detected high unstable rate on jenkins jobs."
+      description: "Job: {{ $labels.id }}"
+- name: "Consul"
+  rules:
+  - alert: ConsulServiceHealthcheckFailed
+    expr: consul_catalog_service_node_healthy == 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Consul service healthcheck failed (instance {{ $labels.instance }})."
+      description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`."
+  - alert: ConsulMissingMasterNode
+    expr: consul_raft_peers < 3
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Consul missing master node (instance {{ $labels.instance }})."
+      description: "Numbers of consul raft peers should be 3, in order to preserve quorum."
+  - alert: ConsulAgentUnhealthy
+    expr: consul_health_node_status{status="critical"} == 1
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Consul agent unhealthy (instance {{ $labels.instance }})."
+      description: "A Consul agent is down."
+- name: "Hosts"
+  rules:
+  - alert: NodeDown
+    expr: up == 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Prometheus target missing (instance {{ $labels.instance }})."
+      description: "A Prometheus target has disappeared. An exporter might be crashed."
+  - alert: HostOutOfMemory
+    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Host out of memory (instance {{ $labels.instance }})."
+      description: "Node memory is filling up (< 10% left)."
+  - alert: HostOomKillDetected
+    expr: increase(node_vmstat_oom_kill[1m]) > 0
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Host OOM kill detected (instance {{ $labels.instance }})."
+      description: "OOM kill detected."
+  - alert: HostMemoryUnderMemoryPressure
+    expr: rate(node_vmstat_pgmajfault[1m]) > 1000
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Host memory under memory pressure (instance {{ $labels.instance }})."
+      description: "The node is under heavy memory pressure. High rate of major page faults."
+  - alert: HostOutOfDiskSpace
+    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Host out of disk space (instance {{ $labels.instance }})."
+      description: "Disk is almost full (< 10% left)."
+  - alert: HostRaidDiskFailure
+    expr: node_md_disks{state="failed"} > 0
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Host RAID disk failure (instance {{ $labels.instance }})."
+      description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap."
+  - alert: HostConntrackLimit
+    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Host conntrack limit (instance {{ $labels.instance }})."
+      description: "The number of conntrack is approaching limit."
+  - alert: HostNetworkInterfaceSaturated
+    expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Host Network Interface Saturated (instance {{ $labels.instance }})."
+      description: "The network interface {{ $labels.device }} on {{ $labels.instance }} is getting overloaded."
+  - alert: HostSystemdServiceCrashed
+    expr: node_systemd_unit_state{state="failed"} == 1
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Host SystemD service crashed (instance {{ $labels.instance }})."
+      description: "SystemD service crashed."
+  - alert: HostEdacCorrectableErrorsDetected
+    expr: increase(node_edac_correctable_errors_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: info
+    annotations:
+      summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})."
+      description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.'
+  - alert: HostEdacUncorrectableErrorsDetected
+    expr: node_edac_uncorrectable_errors_total > 0
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})."
+      description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC.'
+- name: "Min.io"
+  rules:
+  - alert: MinioDiskOffline
+    expr: minio_offline_disks > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Minio disk offline (instance {{ $labels.instance }})"
+      description: "Minio disk is offline."
+  - alert: MinioStorageSpaceExhausted
+    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 < 10
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Minio storage space exhausted (instance {{ $labels.instance }})."
+      description: "Minio storage space is low (< 10 GB)."
+- name: "Prometheus"
+  rules:
+  - alert: PrometheusConfigurationReloadFailure
+    expr: prometheus_config_last_reload_successful != 1
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})."
+      description: "Prometheus configuration reload error."
+  - alert: PrometheusTooManyRestarts
+    expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Prometheus too many restarts (instance {{ $labels.instance }})."
+      description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping."
+  - alert: PrometheusAlertmanagerConfigurationReloadFailure
+    expr: alertmanager_config_last_reload_successful != 1
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})."
+      description: "AlertManager configuration reload error."
+  - alert: PrometheusRuleEvaluationFailures
+    expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})."
+      description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts."
+  - alert: PrometheusTargetScrapingSlow
+    expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Prometheus target scraping slow (instance {{ $labels.instance }})."
+      description: "Prometheus is scraping exporters slowly."
+  - alert: PrometheusTsdbCompactionsFailed
+    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})."
+      description: "Prometheus encountered {{ $value }} TSDB compactions failures."
+  - alert: PrometheusTsdbHeadTruncationsFailed
+    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})."
+      description: "Prometheus encountered {{ $value }} TSDB head truncation failures."
+  - alert: PrometheusTsdbWalCorruptions
+    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})."
+      description: "Prometheus encountered {{ $value }} TSDB WAL corruptions."
+  - alert: PrometheusTsdbWalTruncationsFailed
+    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})."
+      description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures."
+EOH
+      }
+
+      template {
+        change_mode     = "noop"
+        change_signal   = "SIGINT"
+        destination     = "secrets/prometheus.yml"
+        data            = <<EOH
+---
+global:
+  scrape_interval:     5s
+  scrape_timeout:      5s
+  evaluation_interval: 5s
+
+alerting:
+  alertmanagers:
+  - consul_sd_configs:
+    - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
+      services: [ 'alertmanager' ]
+
+rule_files:
+  - 'alerts.yml'
+
+scrape_configs:
+
+  - job_name: 'Nomad Cluster'
+    consul_sd_configs:
+    - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
+      services: [ 'nomad-client', 'nomad' ]
+    relabel_configs:
+    - source_labels: [__meta_consul_tags]
+      regex: '(.*)http(.*)'
+      action: keep
+    metrics_path: /v1/metrics
+    params:
+      format: [ 'prometheus' ]
+
+  - job_name: 'Consul Cluster'
+    static_configs:
+      - targets: [ '10.30.51.22:8500' ]
+      - targets: [ '10.30.51.24:8500' ]
+      - targets: [ '10.30.51.25:8500' ]
+      - targets: [ '10.30.51.26:8500' ]
+      - targets: [ '10.30.51.28:8500' ]
+      - targets: [ '10.30.51.29:8500' ]
+      - targets: [ '10.30.51.30:8500' ]
+      - targets: [ '10.30.51.39:8500' ]
+      - targets: [ '10.30.51.40:8500' ]
+      - targets: [ '10.30.51.50:8500' ]
+      - targets: [ '10.30.51.51:8500' ]
+      - targets: [ '10.30.51.65:8500' ]
+      - targets: [ '10.30.51.66:8500' ]
+      - targets: [ '10.30.51.67:8500' ]
+      - targets: [ '10.30.51.68:8500' ]
+      - targets: [ '10.30.51.70:8500' ]
+      - targets: [ '10.30.51.71:8500' ]
+      - targets: [ '10.32.8.14:8500' ]
+      - targets: [ '10.32.8.15:8500' ]
+      - targets: [ '10.32.8.16:8500' ]
+      - targets: [ '10.32.8.17:8500' ]
+    metrics_path: /v1/agent/metrics
+    params:
+      format: [ 'prometheus' ]
+
+  - job_name: 'Blackbox Exporter (icmp)'
+    static_configs:
+      - targets: [ 'gerrit.fd.io' ]
+      - targets: [ 'jenkins.fd.io' ]
+      - targets: [ '10.32.8.17' ]
+    params:
+      module: [ 'icmp_v4' ]
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target
+      - source_labels: [__param_target]
+        target_label: instance
+      - target_label: __address__
+        replacement: localhost:9115
+    metrics_path: /probe
+
+  - job_name: 'Blackbox Exporter (http)'
+    static_configs:
+      - targets: [ 'gerrit.fd.io' ]
+      - targets: [ 'jenkins.fd.io' ]
+    params:
+      module: [ 'http_2xx' ]
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target
+      - source_labels: [__param_target]
+        target_label: instance
+      - target_label: __address__
+        replacement: localhost:9115
+    metrics_path: /probe
+
+  - job_name: 'Jenkins Job Health Exporter'
+    static_configs:
+      - targets: [ '10.30.51.22:9186' ]
+    metric_relabel_configs:
+      - source_labels: [ __name__ ]
+        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
+        action: replace
+        replacement: '$1'
+        target_label: id
+      - source_labels: [ __name__ ]
+        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
+        replacement: 'jenkins_job_$2'
+        target_label: __name__
+
+  - job_name: 'Node Exporter'
+    static_configs:
+      - targets: [ '10.30.51.22:9100' ]
+      - targets: [ '10.30.51.24:9100' ]
+      - targets: [ '10.30.51.25:9100' ]
+      - targets: [ '10.30.51.26:9100' ]
+      - targets: [ '10.30.51.28:9100' ]
+      - targets: [ '10.30.51.29:9100' ]
+      - targets: [ '10.30.51.30:9100' ]
+      - targets: [ '10.30.51.39:9100' ]
+      - targets: [ '10.30.51.40:9100' ]
+      - targets: [ '10.30.51.50:9100' ]
+      - targets: [ '10.30.51.51:9100' ]
+      - targets: [ '10.30.51.65:9100' ]
+      - targets: [ '10.30.51.66:9100' ]
+      - targets: [ '10.30.51.67:9100' ]
+      - targets: [ '10.30.51.68:9100' ]
+      - targets: [ '10.30.51.70:9100' ]
+      - targets: [ '10.30.51.71:9100' ]
+      - targets: [ '10.32.8.14:9100' ]
+      - targets: [ '10.32.8.15:9100' ]
+      - targets: [ '10.32.8.16:9100' ]
+      - targets: [ '10.32.8.17:9100' ]
+
+  - job_name: 'Alertmanager'
+    consul_sd_configs:
+    - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
+      services: [ 'alertmanager' ]
+
+  - job_name: 'Grafana'
+    consul_sd_configs:
+    - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
+      services: [ 'grafana' ]
+
+  - job_name: 'Prometheus'
+    consul_sd_configs:
+    - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
+      services: [ 'prometheus' ]
+
+  - job_name: 'Minio'  # NOTE(review): the bearer_token below is a credential committed in plain text; rotate it and source it from Vault.
+    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg
+    consul_sd_configs:
+    - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
+      services: [ 'storage' ]
+    metrics_path: /minio/prometheus/metrics
+
+  - job_name: 'Minio Proxy'  # NOTE(review): the bearer_token below is a credential committed in plain text; rotate it and source it from Vault.
+    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3OTAwNjE1NDIsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJBS0lBUTJBSDdZUFBXVDZDV1hYSSJ9.CU9x9j-yO0_Uta5iep6yqNiGQPolrr2608E3lpU6Yg21rIv_eOwS5zqzXaSvrhzkJP9H5kO1Pj6kqjYhbqjN_w
+    consul_sd_configs:
+    - server: '{{ env "NOMAD_IP_prometheus" }}:8500'
+      services: [ 'minio' ]
+    metrics_path: /minio/v2/metrics/cluster
+EOH
+      }
+
+      # The service stanza instructs Nomad to register a service with Consul.
+      #
+      # https://www.nomadproject.io/docs/job-specification/service
+      #
+      service {
+        name       = "${service_name}"
+        port       = "${service_name}"
+        tags       = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ]
+        check {
+          name     = "Prometheus Check Live"
+          type     = "http"
+          path     = "/-/healthy"
+          interval = "10s"
+          timeout  = "2s"
+        }
+      }
+
+      # The "resources" stanza describes the requirements a task needs to
+      # execute. Resource requirements include memory, network, cpu, and more.
+      # This ensures the task will execute on a machine that contains enough
+      # resource capacity.
+      #
+      # https://www.nomadproject.io/docs/job-specification/resources
+      #
+      resources {
+        cpu    = ${cpu}
+        memory = ${memory}
+      }
+    }
+  }
+}
diff --git a/fdio.infra.terraform/1n_nmd/prometheus/fdio/main.tf b/fdio.infra.terraform/1n_nmd/prometheus/fdio/main.tf
new file mode 100644
index 0000000000..e0ca417a78
--- /dev/null
+++ b/fdio.infra.terraform/1n_nmd/prometheus/fdio/main.tf
@@ -0,0 +1,10 @@
+module "prometheus" {
+  providers = {
+    nomad = nomad.yul1
+  }
+  source = "../"
+
+  # prometheus
+  datacenters = ["yul1"]
+  pm_version  = "2.33.1"
+}
\ No newline at end of file
diff --git a/fdio.infra.terraform/1n_nmd/prometheus/fdio/providers.tf b/fdio.infra.terraform/1n_nmd/prometheus/fdio/providers.tf
new file mode 100644
index 0000000000..42a6a45ce0
--- /dev/null
+++ b/fdio.infra.terraform/1n_nmd/prometheus/fdio/providers.tf
@@ -0,0 +1,13 @@
+provider "nomad" {
+  alias   = "yul1"
+  address = var.nomad_provider_address
+  # ca_file   = var.nomad_provider_ca_file
+  # cert_file = var.nomad_provider_cert_file
+  # key_file  = var.nomad_provider_key_file
+}
+
+provider "vault" {
+  token           = var.vault_provider_token
+  address         = var.vault_provider_address
+  skip_tls_verify = var.vault_provider_skip_tls_verify
+}
\ No newline at end of file
diff --git a/fdio.infra.terraform/1n_nmd/prometheus/fdio/variables.tf b/fdio.infra.terraform/1n_nmd/prometheus/fdio/variables.tf
new file mode 100644
index 0000000000..7d5be09d21
--- /dev/null
+++ b/fdio.infra.terraform/1n_nmd/prometheus/fdio/variables.tf
@@ -0,0 +1,47 @@
+variable "nomad_acl" {
+  description = "Nomad ACLs enabled/disabled."
+  type        = bool
+  default     = false
+}
+
+variable "nomad_provider_address" {
+  description = "FD.io Nomad cluster address."
+  type        = string
+  default     = "http://10.32.8.14:4646"
+}
+
+variable "nomad_provider_ca_file" {
+  description = "A local file path to a PEM-encoded certificate authority."
+  type        = string
+  default     = "/etc/nomad.d/ssl/nomad-ca.pem"
+}
+
+variable "nomad_provider_cert_file" {
+  description = "A local file path to a PEM-encoded certificate."
+  type        = string
+  default     = "/etc/nomad.d/ssl/nomad-cli.pem"
+}
+
+variable "nomad_provider_key_file" {
+  description = "A local file path to a PEM-encoded private key."
+  type        = string
+  default     = "/etc/nomad.d/ssl/nomad-cli-key.pem"
+}
+
+variable "vault_provider_address" {
+  description = "Vault cluster address."
+  type        = string
+  default     = "http://10.30.51.28:8200"
+}
+
+variable "vault_provider_skip_tls_verify" {
+  description = "Set to true to skip verification of the Vault server's TLS certificate."
+  type        = bool
+  default     = false
+}
+
+variable "vault_provider_token" {
+  description = "Vault root token."
+  type        = string
+  sensitive   = true
+}
\ No newline at end of file
diff --git a/fdio.infra.terraform/1n_nmd/prometheus/fdio/versions.tf b/fdio.infra.terraform/1n_nmd/prometheus/fdio/versions.tf
new file mode 100644
index 0000000000..f83709d154
--- /dev/null
+++ b/fdio.infra.terraform/1n_nmd/prometheus/fdio/versions.tf
@@ -0,0 +1,18 @@
+terraform {
+  backend "consul" {
+    address = "10.32.8.14:8500"
+    scheme  = "http"
+    path    = "terraform/prometheus"
+  }
+  required_providers {
+    nomad = {
+      source  = "hashicorp/nomad"
+      version = ">= 1.4.16"
+    }
+    vault = {
+      source  = "hashicorp/vault"
+      version = ">= 3.2.1"
+    }
+  }
+  required_version = ">= 1.1.4"
+}
\ No newline at end of file
diff --git a/fdio.infra.terraform/1n_nmd/prometheus/main.tf b/fdio.infra.terraform/1n_nmd/prometheus/main.tf
index 0c504c9ca4..0a4d987831 100644
--- a/fdio.infra.terraform/1n_nmd/prometheus/main.tf
+++ b/fdio.infra.terraform/1n_nmd/prometheus/main.tf
@@ -1,37 +1,42 @@
 locals {
-  datacenters = join(",", var.nomad_datacenters)
-
-  prometheus_url = join("",
+  datacenters = join(",", var.datacenters)
+  url = join("",
     [
       "https://github.com",
       "/prometheus/prometheus/releases/download/",
-      "v${var.prometheus_version}/",
-      "prometheus-${var.prometheus_version}.linux-amd64.tar.gz"
+      "v${var.pm_version}/",
+      "prometheus-${var.pm_version}.linux-amd64.tar.gz"
     ]
   )
 }
 
-data "template_file" "nomad_job_prometheus" {
-  template = file("${path.module}/conf/nomad/prometheus.hcl")
-  vars = {
-    datacenters        = local.datacenters
-    url                = local.prometheus_url
-    job_name           = var.prometheus_job_name
-    use_canary         = var.prometheus_use_canary
-    group_count        = var.prometheus_group_count
-    use_host_volume    = var.prometheus_use_host_volume
-    host_volume        = var.nomad_host_volume
-    data_dir           = var.prometheus_data_dir
-    service_name       = var.prometheus_service_name
-    use_vault_provider = var.prometheus_vault_secret.use_vault_provider
-    version            = var.prometheus_version
-    cpu                = var.prometheus_cpu
-    mem                = var.prometheus_mem
-    port               = var.prometheus_port
-  }
-}
-
 resource "nomad_job" "nomad_job_prometheus" {
-  jobspec = data.template_file.nomad_job_prometheus.rendered
-  detach  = false
-}
\ No newline at end of file
+  jobspec = templatefile(
+    "${path.module}/conf/nomad/prometheus.hcl.tftpl",
+    {
+      auto_promote              = var.auto_promote,
+      auto_revert               = var.auto_revert,
+      canary                    = var.canary,
+      cpu                       = var.cpu,
+      datacenters               = local.datacenters,
+      group_count               = var.group_count,
+      job_name                  = var.job_name,
+      max_parallel              = var.max_parallel,
+      memory                    = var.memory,
+      port                      = var.port,
+      region                    = var.region,
+      service_name              = var.service_name,
+      url                       = local.url,
+      use_canary                = var.use_canary,
+      use_host_volume           = var.use_host_volume,
+      use_vault_provider        = var.vault_secret.use_vault_provider,
+      vault_kv_policy_name      = var.vault_secret.vault_kv_policy_name,
+      vault_kv_path             = var.vault_secret.vault_kv_path,
+      vault_kv_field_access_key = var.vault_secret.vault_kv_field_access_key,
+      vault_kv_field_secret_key = var.vault_secret.vault_kv_field_secret_key,
+      version                   = var.pm_version,
+      volume_destination        = var.volume_destination,
+      volume_source             = var.volume_source
+  })
+  detach = false
+}
diff --git a/fdio.infra.terraform/1n_nmd/prometheus/variables.tf b/fdio.infra.terraform/1n_nmd/prometheus/variables.tf
index befd9412cb..d44f9d5f8a 100644
--- a/fdio.infra.terraform/1n_nmd/prometheus/variables.tf
+++ b/fdio.infra.terraform/1n_nmd/prometheus/variables.tf
@@ -1,84 +1,127 @@
 # Nomad
-variable "nomad_datacenters" {
-  description = "Nomad data centers"
+variable "datacenters" {
+  description = "Specifies the list of DCs to be considered placing this task"
   type        = list(string)
   default     = ["dc1"]
 }
 
-variable "nomad_host_volume" {
-  description = "Nomad Host Volume"
+variable "region" {
+  description = "Specifies the region in which to execute the job"
+  type        = string
+  default     = "global"
+}
+
+variable "volume_source" {
+  description = "The name of the volume to request"
   type        = string
   default     = "persistence"
 }
 
 # Prometheus
-variable "prometheus_job_name" {
-  description = "Prometheus job name"
+variable "pm_version" {
+  description = "Prometheus version"
   type        = string
-  default     = "prometheus"
+  default     = "2.33.1"
+}
+
+variable "auto_promote" {
+  description = "Specifies if the job should auto-promote to the canary version"
+  type        = bool
+  default     = true
+}
+
+variable "auto_revert" {
+  description = "Specifies if the job should auto-revert to the last stable job"
+  type        = bool
+  default     = true
 }
 
-variable "prometheus_group_count" {
-  description = "Number of prometheus group instances"
+variable "canary" {
+  description = "Equal to the count of the task group allows blue/green depl."
   type        = number
   default     = 1
 }
 
-variable "prometheus_service_name" {
-  description = "Prometheus service name"
-  type        = string
-  default     = "prometheus"
+variable "cpu" {
+  description = "CPU allocation"
+  type        = number
+  default     = 2000
 }
 
-variable "prometheus_version" {
-  description = "Prometheus version"
+variable "data_dir" {
+  description = "Prometheus DISK allocation"
   type        = string
-  default     = "v2.28.1"
+  default     = "/data"
 }
 
-variable "prometheus_use_canary" {
-  description = "Uses canary deployment"
-  type        = bool
-  default     = false
+variable "group_count" {
+  description = "Specifies the number of the task groups running under this one"
+  type        = number
+  default     = 4
 }
 
-variable "prometheus_vault_secret" {
-  description = "Set of properties to be able to fetch secret from vault"
-  type = object({
-    use_vault_provider        = bool,
-    vault_kv_policy_name      = string,
-    vault_kv_path             = string,
-    vault_kv_field_access_key = string,
-    vault_kv_field_secret_key = string
-  })
+variable "job_name" {
+  description = "Specifies a name for the job"
+  type        = string
+  default     = "prometheus"
 }
 
-variable "prometheus_cpu" {
-  description = "Prometheus CPU allocation"
+variable "max_parallel" {
+  description = "Specifies the maximum number of updates to perform in parallel"
   type        = number
-  default     = 2000
+  default     = 1
 }
 
-variable "prometheus_mem" {
-  description = "Prometheus RAM allocation"
+variable "memory" {
+  description = "Specifies the memory required in MB"
   type        = number
-  default     = 8192
+  default     = 4096
 }
 
-variable "prometheus_port" {
-  description = "Prometheus TCP allocation"
+variable "port" {
+  description = "Specifies the static TCP/UDP port to allocate"
   type        = number
-  default     = 9200
+  default     = 9090
 }
 
-variable "prometheus_data_dir" {
-  description = "Prometheus DISK allocation"
+variable "service_name" {
+  description = "Specifies the name this service will be advertised in Consul"
   type        = string
-  default     = "/data"
+  default     = "prometheus"
+}
+
+variable "use_canary" {
+  description = "Uses canary deployment"
+  type        = bool
+  default     = true
 }
 
-variable "prometheus_use_host_volume" {
+variable "use_host_volume" {
   description = "Use Nomad host volume feature"
   type        = bool
   default     = false
-}
\ No newline at end of file
+}
+
+variable "volume_destination" {
+  description = "Specifies where the volume should be mounted inside the task"
+  type        = string
+  default     = "/data/"
+}
+
+variable "vault_secret" {
+  type = object({
+    use_vault_provider        = bool,
+    vault_kv_policy_name      = string,
+    vault_kv_path             = string,
+    vault_kv_field_access_key = string,
+    vault_kv_field_secret_key = string
+  })
+  description = "Set of properties to be able to fetch secret from vault."
+  default = {
+    use_vault_provider        = false
+    vault_kv_policy_name      = "kv"
+    vault_kv_path             = "secret/data/prometheus"
+    vault_kv_field_access_key = "access_key"
+    vault_kv_field_secret_key = "secret_key"
+  }
+}
diff --git a/fdio.infra.terraform/1n_nmd/prometheus/versions.tf b/fdio.infra.terraform/1n_nmd/prometheus/versions.tf
index b80610a525..a01708f28a 100644
--- a/fdio.infra.terraform/1n_nmd/prometheus/versions.tf
+++ b/fdio.infra.terraform/1n_nmd/prometheus/versions.tf
@@ -2,12 +2,8 @@ terraform {
   required_providers {
     nomad = {
       source  = "hashicorp/nomad"
-      version = "~> 1.4.15"
-    }
-    template = {
-      source  = "hashicorp/template"
-      version = "~> 2.2.0"
+      version = ">= 1.4.16"
     }
   }
-  required_version = ">= 1.0.3"
+  required_version = ">= 1.1.4"
 }