diff options
Diffstat (limited to 'fdio.infra.terraform/terraform-nomad-alertmanager')
8 files changed, 682 insertions, 0 deletions
diff --git a/fdio.infra.terraform/terraform-nomad-alertmanager/conf/nomad/alertmanager.hcl.tftpl b/fdio.infra.terraform/terraform-nomad-alertmanager/conf/nomad/alertmanager.hcl.tftpl new file mode 100644 index 0000000000..87206ac5a0 --- /dev/null +++ b/fdio.infra.terraform/terraform-nomad-alertmanager/conf/nomad/alertmanager.hcl.tftpl @@ -0,0 +1,377 @@ +job "${job_name}" { + # The "region" parameter specifies the region in which to execute the job. + # If omitted, this inherits the default region name of "global". + # region = "${region}" + + # The "datacenters" parameter specifies the list of datacenters which should + # be considered when placing this task. This must be provided. + datacenters = "${datacenters}" + + # The "type" parameter controls the type of job, which impacts the scheduler's + # decision on placement. This configuration is optional and defaults to + # "service". For a full list of job types and their differences, please see + # the online documentation. + # + # https://www.nomadproject.io/docs/jobspec/schedulers + # + type = "service" + + update { + # The "max_parallel" parameter specifies the maximum number of updates to + # perform in parallel. In this case, this specifies to update a single task + # at a time. + max_parallel = ${max_parallel} + + health_check = "checks" + + # The "min_healthy_time" parameter specifies the minimum time the allocation + # must be in the healthy state before it is marked as healthy and unblocks + # further allocations from being updated. + min_healthy_time = "10s" + + # The "healthy_deadline" parameter specifies the deadline in which the + # allocation must be marked as healthy after which the allocation is + # automatically transitioned to unhealthy. Transitioning to unhealthy will + # fail the deployment and potentially roll back the job if "auto_revert" is + # set to true. + healthy_deadline = "3m" + + # The "progress_deadline" parameter specifies the deadline in which an + # allocation must be marked as healthy. The deadline begins when the first + # allocation for the deployment is created and is reset whenever an allocation + # as part of the deployment transitions to a healthy state. If no allocation + # transitions to the healthy state before the progress deadline, the + # deployment is marked as failed. + progress_deadline = "10m" + +%{ if use_canary } + # The "canary" parameter specifies that changes to the job that would result + # in destructive updates should create the specified number of canaries + # without stopping any previous allocations. Once the operator determines the + # canaries are healthy, they can be promoted which unblocks a rolling update + # of the remaining allocations at a rate of "max_parallel". + # + # Further, setting "canary" equal to the count of the task group allows + # blue/green deployments. When the job is updated, a full set of the new + # version is deployed and upon promotion the old version is stopped. + canary = ${canary} + + # Specifies if the job should auto-promote to the canary version when all + # canaries become healthy during a deployment. Defaults to false which means + # canaries must be manually updated with the nomad deployment promote + # command. + auto_promote = ${auto_promote} + + # The "auto_revert" parameter specifies if the job should auto-revert to the + # last stable job on deployment failure. A job is marked as stable if all the + # allocations as part of its deployment were marked healthy. + auto_revert = ${auto_revert} +%{ endif } + } + + # All groups in this job should be scheduled on different hosts. + constraint { + operator = "distinct_hosts" + value = "true" + } + + # The "group" stanza defines a series of tasks that should be co-located on + # the same Nomad client. Any task within a group will be placed on the same + # client. + # + # https://www.nomadproject.io/docs/job-specification/group + # + group "${job_name}-group-1" { + # The "count" parameter specifies the number of the task groups that should + # be running under this group. This value must be non-negative and defaults + # to 1. + count = ${group_count} + + # The volume stanza allows the group to specify that it requires a given + # volume from the cluster. The key of the stanza is the name of the volume + # as it will be exposed to task configuration. + # + # https://www.nomadproject.io/docs/job-specification/volume + %{ if use_host_volume } + volume "${job_name}-volume-1" { + type = "host" + read_only = false + source = "${volume_source}" + } + %{ endif } + + # The restart stanza configures a tasks's behavior on task failure. Restarts + # happen on the client that is running the task. + # + # https://www.nomadproject.io/docs/job-specification/restart + # + restart { + interval = "30m" + attempts = 40 + delay = "15s" + mode = "delay" + } + + # The constraint allows restricting the set of eligible nodes. Constraints + # may filter on attributes or client metadata. + # + # https://www.nomadproject.io/docs/job-specification/constraint + # + constraint { + attribute = "$${attr.cpu.arch}" + operator = "!=" + value = "arm64" + } + + constraint { + attribute = "$${node.class}" + value = "builder" + } + + # The network stanza specifies the networking requirements for the task + # group, including the network mode and port allocations. When scheduling + # jobs in Nomad they are provisioned across your fleet of machines along + # with other jobs and services. Because you don't know in advance what host + # your job will be provisioned on, Nomad will provide your tasks with + # network configuration when they start up. + # + # https://www.nomadproject.io/docs/job-specification/network + # + network { + port "${service_name}" { + static = ${port} + to = ${port} + } + } + + # The "task" stanza creates an individual unit of work, such as a Docker + # container, web application, or batch processing. + # + # https://www.nomadproject.io/docs/job-specification/task + # + task "${job_name}-task-1" { + # The "driver" parameter specifies the task driver that should be used to + # run the task. + driver = "exec" + + %{ if use_host_volume } + volume_mount { + volume = "${job_name}-volume-1" + destination = "${volume_destination}" + read_only = false + } + %{ endif } + + %{ if use_vault_provider } + vault { + policies = "${vault_kv_policy_name}" + } + %{ endif } + + # The "config" stanza specifies the driver configuration, which is passed + # directly to the driver to start the task. The details of configurations + # are specific to each driver, so please see specific driver + # documentation for more information. + config { + command = "local/alertmanager-${version}.linux-amd64/alertmanager" + args = [ + "--config.file=secrets/alertmanager.yml" + ] + } + + # The artifact stanza instructs Nomad to fetch and unpack a remote resource, + # such as a file, tarball, or binary. Nomad downloads artifacts using the + # popular go-getter library, which permits downloading artifacts from a + # variety of locations using a URL as the input source. + # + # https://www.nomadproject.io/docs/job-specification/artifact + # + artifact { + source = "${url}" + } + + # The "template" stanza instructs Nomad to manage a template, such as + # a configuration file or script. This template can optionally pull data + # from Consul or Vault to populate runtime configuration data. + # + # https://www.nomadproject.io/docs/job-specification/template + # + template { + change_mode = "noop" + change_signal = "SIGINT" + destination = "secrets/alertmanager.yml" + left_delimiter = "{{{" + right_delimiter = "}}}" + data = <<EOH +# The directory from which notification templates are read. +templates: +- '/etc/alertmanager/template/*.tmpl' + +#tls_config: +# # CA certificate to validate the server certificate with. +# ca_file: <filepath> ] +# +# # Certificate and key files for client cert authentication to the server. +# cert_file: <filepath> +# key_file: <filepath> +# +# # ServerName extension to indicate the name of the server. +# # http://tools.ietf.org/html/rfc4366#section-3.1 +# server_name: <string> +# +# # Disable validation of the server certificate. +# insecure_skip_verify: true + +# The root route on which each incoming alert enters. +route: + receiver: '${slack_default_receiver}' + + # The labels by which incoming alerts are grouped together. For example, + # multiple alerts coming in for cluster=A and alertname=LatencyHigh would + # be batched into a single group. + # + # To aggregate by all possible labels use '...' as the sole label name. + # This effectively disables aggregation entirely, passing through all + # alerts as-is. This is unlikely to be what you want, unless you have + # a very low alert volume or your upstream notification system performs + # its own grouping. Example: group_by: [...] + group_by: ['alertname'] + + # When a new group of alerts is created by an incoming alert, wait at + # least 'group_wait' to send the initial notification. + # This way ensures that you get multiple alerts for the same group that start + # firing shortly after another are batched together on the first + # notification. + group_wait: 30s + + # When the first notification was sent, wait 'group_interval' to send a batch + # of new alerts that started firing for that group. + group_interval: 5m + + # If an alert has successfully been sent, wait 'repeat_interval' to + # resend them. + repeat_interval: 3h + + # All the above attributes are inherited by all child routes and can + # overwritten on each. + # The child route trees. + routes: + - match_re: + alertname: JenkinsJob.* + receiver: ${slack_jenkins_receiver} + routes: + - match: + severity: critical + receiver: '${slack_jenkins_receiver}' + + - match_re: + service: .* + receiver: ${slack_default_receiver} + routes: + - match: + severity: critical + receiver: '${slack_default_receiver}' + +# Inhibition rules allow to mute a set of alerts given that another alert is +# firing. +# We use this to mute any warning-level notifications if the same alert is +# already critical. +inhibit_rules: +- source_match: + severity: 'critical' + target_match: + severity: 'warning' + equal: ['alertname', 'instance'] + +receivers: +- name: '${slack_jenkins_receiver}' + slack_configs: + - api_url: 'https://hooks.slack.com/services/${slack_jenkins_api_key}' + channel: '#${slack_jenkins_channel}' + send_resolved: true + icon_url: https://avatars3.githubusercontent.com/u/3380462 + title: |- + [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }} + {{- if gt (len .CommonLabels) (len .GroupLabels) -}} + {{" "}}( + {{- with .CommonLabels.Remove .GroupLabels.Names }} + {{- range $index, $label := .SortedPairs -}} + {{ if $index }}, {{ end }} + {{- $label.Name }}="{{ $label.Value -}}" + {{- end }} + {{- end -}} + ) + {{- end }} + text: >- + {{ range .Alerts -}} + *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }} + + *Description:* {{ .Annotations.description }} + + *Details:* + {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` + {{ end }} + {{ end }} + +- name: '${slack_default_receiver}' + slack_configs: + - api_url: 'https://hooks.slack.com/services/${slack_default_api_key}' + channel: '#${slack_default_channel}' + send_resolved: true + icon_url: https://avatars3.githubusercontent.com/u/3380462 + title: |- + [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }} + {{- if gt (len .CommonLabels) (len .GroupLabels) -}} + {{" "}}( + {{- with .CommonLabels.Remove .GroupLabels.Names }} + {{- range $index, $label := .SortedPairs -}} + {{ if $index }}, {{ end }} + {{- $label.Name }}="{{ $label.Value -}}" + {{- end }} + {{- end -}} + ) + {{- end }} + text: >- + {{ range .Alerts -}} + *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }} + + *Description:* {{ .Annotations.description }} + + *Details:* + {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` + {{ end }} + {{ end }} +EOH + } + + # The service stanza instructs Nomad to register a service with Consul. + # + # https://www.nomadproject.io/docs/job-specification/service + # + service { + name = "${service_name}" + port = "${service_name}" + tags = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ] + check { + name = "Alertmanager Check Live" + type = "http" + path = "/-/healthy" + interval = "10s" + timeout = "2s" + } + } + + # The "resources" stanza describes the requirements a task needs to + # execute. Resource requirements include memory, network, cpu, and more. + # This ensures the task will execute on a machine that contains enough + # resource capacity. + # + # https://www.nomadproject.io/docs/job-specification/resources + # + resources { + cpu = ${cpu} + memory = ${memory} + } + } + } +} diff --git a/fdio.infra.terraform/terraform-nomad-alertmanager/fdio/main.tf b/fdio.infra.terraform/terraform-nomad-alertmanager/fdio/main.tf new file mode 100644 index 0000000000..745e450a8c --- /dev/null +++ b/fdio.infra.terraform/terraform-nomad-alertmanager/fdio/main.tf @@ -0,0 +1,14 @@ +module "alertmanager" { + providers = { + nomad = nomad.yul1 + } + source = "../" + + # alertmanager + datacenters = ["yul1"] + slack_jenkins_api_key = "TE07RD1V1/B01U1NV9HV3/hKZXJJ74g2JcISq4K3QC1eG9" + slack_jenkins_channel = "fdio-jobs-monitoring" + slack_default_api_key = "TE07RD1V1/B01UUK23B6C/hZTcCu42FUv8d6rtirHtcYIi" + slack_default_channel = "fdio-infra-monitoring" + am_version = "0.23.0" +}
\ No newline at end of file diff --git a/fdio.infra.terraform/terraform-nomad-alertmanager/fdio/providers.tf b/fdio.infra.terraform/terraform-nomad-alertmanager/fdio/providers.tf new file mode 100644 index 0000000000..42a6a45ce0 --- /dev/null +++ b/fdio.infra.terraform/terraform-nomad-alertmanager/fdio/providers.tf @@ -0,0 +1,13 @@ +provider "nomad" { + address = var.nomad_provider_address + alias = "yul1" + # ca_file = var.nomad_provider_ca_file + # cert_file = var.nomad_provider_cert_file + # key_file = var.nomad_provider_key_file +} + +provider "vault" { + address = var.vault_provider_address + skip_tls_verify = var.vault_provider_skip_tls_verify + token = var.vault_provider_token +}
\ No newline at end of file diff --git a/fdio.infra.terraform/terraform-nomad-alertmanager/fdio/variables.tf b/fdio.infra.terraform/terraform-nomad-alertmanager/fdio/variables.tf new file mode 100644 index 0000000000..7d5be09d21 --- /dev/null +++ b/fdio.infra.terraform/terraform-nomad-alertmanager/fdio/variables.tf @@ -0,0 +1,47 @@ +variable "nomad_acl" { + description = "Nomad ACLs enabled/disabled." + type = bool + default = false +} + +variable "nomad_provider_address" { + description = "FD.io Nomad cluster address." + type = string + default = "http://10.32.8.14:4646" +} + +variable "nomad_provider_ca_file" { + description = "A local file path to a PEM-encoded certificate authority." + type = string + default = "/etc/nomad.d/ssl/nomad-ca.pem" +} + +variable "nomad_provider_cert_file" { + description = "A local file path to a PEM-encoded certificate." + type = string + default = "/etc/nomad.d/ssl/nomad-cli.pem" +} + +variable "nomad_provider_key_file" { + description = "A local file path to a PEM-encoded private key." + type = string + default = "/etc/nomad.d/ssl/nomad-cli-key.pem" +} + +variable "vault_provider_address" { + description = "Vault cluster address." + type = string + default = "http://10.30.51.28:8200" +} + +variable "vault_provider_skip_tls_verify" { + description = "Verification of the Vault server's TLS certificate." + type = bool + default = false +} + +variable "vault_provider_token" { + description = "Vault root token." + type = string + sensitive = true +}
\ No newline at end of file diff --git a/fdio.infra.terraform/terraform-nomad-alertmanager/fdio/versions.tf b/fdio.infra.terraform/terraform-nomad-alertmanager/fdio/versions.tf new file mode 100644 index 0000000000..385c5c3f18 --- /dev/null +++ b/fdio.infra.terraform/terraform-nomad-alertmanager/fdio/versions.tf @@ -0,0 +1,17 @@ +terraform { + backend "consul" { + address = "10.32.8.14:8500" + scheme = "http" + path = "terraform/alertmanager" + } + required_providers { + nomad = { + source = "hashicorp/nomad" + version = ">= 1.4.16" + } + vault = { + version = ">= 3.2.1" + } + } + required_version = ">= 1.1.4" +}
\ No newline at end of file diff --git a/fdio.infra.terraform/terraform-nomad-alertmanager/main.tf b/fdio.infra.terraform/terraform-nomad-alertmanager/main.tf new file mode 100644 index 0000000000..e8a1389150 --- /dev/null +++ b/fdio.infra.terraform/terraform-nomad-alertmanager/main.tf @@ -0,0 +1,48 @@ +locals { + datacenters = join(",", var.datacenters) + url = join("", + [ + "https://github.com", + "/prometheus/alertmanager/releases/download/", + "v${var.am_version}/", + "alertmanager-${var.am_version}.linux-amd64.tar.gz" + ] + ) +} + +resource "nomad_job" "nomad_job_alertmanager" { + jobspec = templatefile( + "${path.module}/conf/nomad/alertmanager.hcl.tftpl", + { + auto_promote = var.auto_promote, + auto_revert = var.auto_revert, + canary = var.canary, + cpu = var.cpu, + datacenters = local.datacenters, + group_count = var.group_count, + job_name = var.job_name, + max_parallel = var.max_parallel, + memory = var.memory + port = var.port, + region = var.region, + service_name = var.service_name, + slack_jenkins_api_key = var.slack_jenkins_api_key, + slack_jenkins_channel = var.slack_jenkins_channel, + slack_jenkins_receiver = var.slack_jenkins_receiver, + slack_default_api_key = var.slack_default_api_key, + slack_default_channel = var.slack_default_channel, + slack_default_receiver = var.slack_default_receiver, + url = local.url, + use_canary = var.use_canary, + use_host_volume = var.use_host_volume, + use_vault_provider = var.vault_secret.use_vault_provider, + vault_kv_policy_name = var.vault_secret.vault_kv_policy_name, + vault_kv_path = var.vault_secret.vault_kv_path, + vault_kv_field_access_key = var.vault_secret.vault_kv_field_access_key, + vault_kv_field_secret_key = var.vault_secret.vault_kv_field_secret_key, + version = var.am_version, + volume_destination = var.volume_destination, + volume_source = var.volume_source + }) + detach = false +} diff --git a/fdio.infra.terraform/terraform-nomad-alertmanager/variables.tf b/fdio.infra.terraform/terraform-nomad-alertmanager/variables.tf new file mode 100644 index 0000000000..e452598fa6 --- /dev/null +++ b/fdio.infra.terraform/terraform-nomad-alertmanager/variables.tf @@ -0,0 +1,157 @@ +# Nomad +variable "datacenters" { + description = "Specifies the list of DCs to be considered placing this task" + type = list(string) + default = ["dc1"] +} + +variable "region" { + description = "Specifies the list of DCs to be considered placing this task" + type = string + default = "global" +} + +variable "volume_source" { + description = "The name of the volume to request" + type = string + default = "persistence" +} + +# Alertmanager +variable "am_version" { + description = "Alertmanager version" + type = string + default = "0.21.0" +} + +variable "auto_promote" { + description = "Specifies if the job should auto-promote to the canary version" + type = bool + default = true +} + +variable "auto_revert" { + description = "Specifies if the job should auto-revert to the last stable job" + type = bool + default = true +} + +variable "canary" { + description = "Equal to the count of the task group allows blue/green depl." + type = number + default = 1 +} + +variable "cpu" { + description = "CPU allocation" + type = number + default = 1000 +} + +variable "group_count" { + description = "Specifies the number of the task groups running under this one" + type = number + default = 1 +} + +variable "job_name" { + description = "Specifies a name for the job" + type = string + default = "alertmanager" +} + +variable "max_parallel" { + description = "Specifies the maximum number of updates to perform in parallel" + type = number + default = 1 +} + +variable "memory" { + description = "Specifies the memory required in MB" + type = number + default = 1024 +} + +variable "port" { + description = "Specifies the static TCP/UDP port to allocate" + type = number + default = 9093 +} + +variable "service_name" { + description = "Specifies the name this service will be advertised in Consul" + type = string + default = "alertmanager" +} + +variable "use_canary" { + description = "Uses canary deployment" + type = bool + default = true +} + +variable "use_host_volume" { + description = "Use Nomad host volume feature" + type = bool + default = false +} + +variable "vault_secret" { + type = object({ + use_vault_provider = bool, + vault_kv_policy_name = string, + vault_kv_path = string, + vault_kv_field_access_key = string, + vault_kv_field_secret_key = string + }) + description = "Set of properties to be able to fetch secret from vault." + default = { + use_vault_provider = false + vault_kv_policy_name = "kv" + vault_kv_path = "secret/data/alertmanager" + vault_kv_field_access_key = "access_key" + vault_kv_field_secret_key = "secret_key" + } +} + +variable "volume_destination" { + description = "Specifies where the volume should be mounted inside the task" + type = string + default = "/data/" +} + +variable "slack_jenkins_api_key" { + description = "Alertmanager jenkins slack API key" + type = string + default = "XXXXXXXXX/XXXXXXXXXXX/XXXXXXXXXXXXXXXXXXXXXXXX" +} + +variable "slack_jenkins_receiver" { + description = "Alertmanager jenkins slack receiver" + type = string + default = "jenkins-slack-receiver" +} + +variable "slack_jenkins_channel" { + description = "Alertmanager jenkins slack channel" + type = string + default = "jenkins-channel" +} + +variable "slack_default_api_key" { + description = "Alertmanager default slack API key" + type = string + default = "XXXXXXXXX/XXXXXXXXXXX/XXXXXXXXXXXXXXXXXXXXXXXX" +} + +variable "slack_default_receiver" { + description = "Alertmanager default slack receiver" + type = string + default = "default-slack-receiver" +} + +variable "slack_default_channel" { + description = "Alertmanager default slack channel" + type = string + default = "default-channel" +}
\ No newline at end of file diff --git a/fdio.infra.terraform/terraform-nomad-alertmanager/versions.tf b/fdio.infra.terraform/terraform-nomad-alertmanager/versions.tf new file mode 100644 index 0000000000..5f283ed4ea --- /dev/null +++ b/fdio.infra.terraform/terraform-nomad-alertmanager/versions.tf @@ -0,0 +1,9 @@ +terraform { + required_providers { + nomad = { + source = "hashicorp/nomad" + version = ">= 1.4.16" + } + } + required_version = ">= 1.1.4" +}
\ No newline at end of file |