aboutsummaryrefslogtreecommitdiffstats
path: root/terraform-ci-infra/1n_nmd/alertmanager
diff options
context:
space:
mode:
authorpmikus <pmikus@cisco.com>2021-01-07 14:27:38 +0000
committerpmikus <pmikus@cisco.com>2021-01-29 08:44:37 +0000
commita44eef233de959c5679602dc6cd1ed866d6abc14 (patch)
tree2c4d0cd5e0d1293ae86a09cf26009b869c72360d /terraform-ci-infra/1n_nmd/alertmanager
parent474163bc78aa418595b227c81056987b1114104d (diff)
Infra: Monitoring capability
+ Monitoring SOA + Nomad alertmanager job + Nomad prometheus job + Nomad grafana job Signed-off-by: pmikus <pmikus@cisco.com> Change-Id: I0b32e9c87276ba1a2d4a5322816f3473c737eae2
Diffstat (limited to 'terraform-ci-infra/1n_nmd/alertmanager')
-rw-r--r--terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl333
-rw-r--r--terraform-ci-infra/1n_nmd/alertmanager/main.tf37
-rw-r--r--terraform-ci-infra/1n_nmd/alertmanager/variables.tf84
3 files changed, 454 insertions, 0 deletions
diff --git a/terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl b/terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl
new file mode 100644
index 0000000000..4560cf07ab
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl
@@ -0,0 +1,333 @@
+job "${job_name}" {
+ # The "region" parameter specifies the region in which to execute the job.
+ # If omitted, this inherits the default region name of "global".
+ # region = "global"
+ #
+ # The "datacenters" parameter specifies the list of datacenters which should
+ # be considered when placing this task. This must be provided.
+ datacenters = "${datacenters}"
+
+ # The "type" parameter controls the type of job, which impacts the scheduler's
+ # decision on placement. This configuration is optional and defaults to
+ # "service". For a full list of job types and their differences, please see
+ # the online documentation.
+ #
+ # For more information, please see the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/jobspec/schedulers
+ #
+ type = "service"
+
+ update {
+ # The "max_parallel" parameter specifies the maximum number of updates to
+ # perform in parallel. In this case, this specifies to update a single task
+ # at a time.
+ max_parallel = 1
+
+ health_check = "checks"
+
+ # The "min_healthy_time" parameter specifies the minimum time the allocation
+ # must be in the healthy state before it is marked as healthy and unblocks
+ # further allocations from being updated.
+ min_healthy_time = "10s"
+
+ # The "healthy_deadline" parameter specifies the deadline in which the
+ # allocation must be marked as healthy after which the allocation is
+ # automatically transitioned to unhealthy. Transitioning to unhealthy will
+ # fail the deployment and potentially roll back the job if "auto_revert" is
+ # set to true.
+ healthy_deadline = "3m"
+
+ # The "progress_deadline" parameter specifies the deadline in which an
+ # allocation must be marked as healthy. The deadline begins when the first
+ # allocation for the deployment is created and is reset whenever an allocation
+ # as part of the deployment transitions to a healthy state. If no allocation
+ # transitions to the healthy state before the progress deadline, the
+ # deployment is marked as failed.
+ progress_deadline = "10m"
+
+%{ if use_canary }
+ # The "canary" parameter specifies that changes to the job that would result
+ # in destructive updates should create the specified number of canaries
+ # without stopping any previous allocations. Once the operator determines the
+ # canaries are healthy, they can be promoted which unblocks a rolling update
+ # of the remaining allocations at a rate of "max_parallel".
+ #
+ # Further, setting "canary" equal to the count of the task group allows
+ # blue/green deployments. When the job is updated, a full set of the new
+ # version is deployed and upon promotion the old version is stopped.
+ canary = 1
+
+ # Specifies if the job should auto-promote to the canary version when all
+ # canaries become healthy during a deployment. Defaults to false which means
+ # canaries must be manually updated with the nomad deployment promote
+ # command.
+ auto_promote = true
+
+ # The "auto_revert" parameter specifies if the job should auto-revert to the
+ # last stable job on deployment failure. A job is marked as stable if all the
+ # allocations as part of its deployment were marked healthy.
+ auto_revert = true
+%{ endif }
+ }
+
+ # The "group" stanza defines a series of tasks that should be co-located on
+ # the same Nomad client. Any task within a group will be placed on the same
+ # client.
+ #
+ # For more information and examples on the "group" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/group
+ #
+ group "prod-group1-${service_name}" {
+ # The "count" parameter specifies the number of the task groups that should
+ # be running under this group. This value must be non-negative and defaults
+ # to 1.
+ count = ${group_count}
+
+ # The constraint allows restricting the set of eligible nodes. Constraints
+ # may filter on attributes or client metadata.
+ #
+ # For more information and examples on the "volume" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/constraint
+ #
+ constraint {
+ attribute = "$${attr.cpu.arch}"
+ operator = "!="
+ value = "arm64"
+ }
+
+ # The "task" stanza creates an individual unit of work, such as a Docker
+ # container, web application, or batch processing.
+ #
+ # For more information and examples on the "task" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/task
+ #
+ task "prod-task1-${service_name}" {
+ # The "driver" parameter specifies the task driver that should be used to
+ # run the task.
+ driver = "exec"
+
+ %{ if use_vault_provider }
+ vault {
+ policies = "${vault_kv_policy_name}"
+ }
+ %{ endif }
+
+ # The "config" stanza specifies the driver configuration, which is passed
+ # directly to the driver to start the task. The details of configurations
+ # are specific to each driver, so please see specific driver
+ # documentation for more information.
+ config {
+ command = "local/alertmanager-${version}.linux-amd64/alertmanager"
+ args = [
+ "--config.file=secrets/alertmanager.yml"
+ ]
+ }
+
+ # The artifact stanza instructs Nomad to fetch and unpack a remote resource,
+ # such as a file, tarball, or binary. Nomad downloads artifacts using the
+ # popular go-getter library, which permits downloading artifacts from a
+ # variety of locations using a URL as the input source.
+ #
+ # For more information and examples on the "artifact" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/artifact
+ #
+ artifact {
+ source = "${url}"
+ }
+
+ # The "template" stanza instructs Nomad to manage a template, such as
+ # a configuration file or script. This template can optionally pull data
+ # from Consul or Vault to populate runtime configuration data.
+ #
+ # For more information and examples on the "template" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/template
+ #
+ template {
+ change_mode = "noop"
+ change_signal = "SIGINT"
+ destination = "secrets/alertmanager.yml"
+ left_delimiter = "{{{"
+ right_delimiter = "}}}"
+ data = <<EOH
+global:
+ # The API URL to use for Slack notifications.
+ slack_api_url: '${slack_api_url}'
+
+# The directory from which notification templates are read.
+templates:
+- '/etc/alertmanager/template/*.tmpl'
+
+#tls_config:
+# # CA certificate to validate the server certificate with.
+# ca_file: <filepath> ]
+#
+# # Certificate and key files for client cert authentication to the server.
+# cert_file: <filepath>
+# key_file: <filepath>
+#
+# # ServerName extension to indicate the name of the server.
+# # http://tools.ietf.org/html/rfc4366#section-3.1
+# server_name: <string>
+#
+# # Disable validation of the server certificate.
+# insecure_skip_verify: true
+
+# The root route on which each incoming alert enters.
+route:
+ receiver: '${default_receiver}'
+
+ # The labels by which incoming alerts are grouped together. For example,
+ # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
+ # be batched into a single group.
+ #
+ # To aggregate by all possible labels use '...' as the sole label name.
+ # This effectively disables aggregation entirely, passing through all
+ # alerts as-is. This is unlikely to be what you want, unless you have
+ # a very low alert volume or your upstream notification system performs
+ # its own grouping. Example: group_by: [...]
+ group_by: ['alertname', 'cluster', 'service']
+
+ # When a new group of alerts is created by an incoming alert, wait at
+ # least 'group_wait' to send the initial notification.
+ # This way ensures that you get multiple alerts for the same group that start
+ # firing shortly after another are batched together on the first
+ # notification.
+ group_wait: 30s
+
+ # When the first notification was sent, wait 'group_interval' to send a batch
+ # of new alerts that started firing for that group.
+ group_interval: 5m
+
+ # If an alert has successfully been sent, wait 'repeat_interval' to
+ # resend them.
+ repeat_interval: 3h
+
+ # All the above attributes are inherited by all child routes and can
+ # overwritten on each.
+ # The child route trees.
+ routes:
+ # This routes performs a regular expression match on alert labels to
+ # catch alerts that are related to a list of services.
+ - match_re:
+ service: .*
+ receiver: ${default_receiver}
+ # The service has a sub-route for critical alerts, any alerts
+ # that do not match, i.e. severity != critical, fall-back to the
+ # parent node and are sent to 'team-X-mails'
+ routes:
+ - match:
+ severity: critical
+ receiver: '${default_receiver}'
+
+# Inhibition rules allow to mute a set of alerts given that another alert is
+# firing.
+# We use this to mute any warning-level notifications if the same alert is
+# already critical.
+inhibit_rules:
+- source_match:
+ severity: 'critical'
+ target_match:
+ severity: 'warning'
+ # Apply inhibition if the alertname is the same.
+ # CAUTION:
+ # If all label names listed in `equal` are missing
+ # from both the source and target alerts,
+ # the inhibition rule will apply!
+ equal: ['alertname', 'cluster', 'service']
+
+receivers:
+- name: '${default_receiver}'
+ slack_configs:
+ - channel: '#${slack_channel}'
+ send_resolved: true
+ icon_url: https://avatars3.githubusercontent.com/u/3380462
+ title: |-
+ [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
+ {{- if gt (len .CommonLabels) (len .GroupLabels) -}}
+ {{" "}}(
+ {{- with .CommonLabels.Remove .GroupLabels.Names }}
+ {{- range $index, $label := .SortedPairs -}}
+ {{ if $index }}, {{ end }}
+ {{- $label.Name }}="{{ $label.Value -}}"
+ {{- end }}
+ {{- end -}}
+ )
+ {{- end }}
+ text: >-
+ {{ range .Alerts -}}
+ *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}
+
+ *Description:* {{ .Annotations.description }}
+
+ *Details:*
+ {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
+ {{ end }}
+ {{ end }}
+EOH
+ }
+
+ # The service stanza instructs Nomad to register a service with Consul.
+ #
+ # For more information and examples on the "task" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/service
+ #
+ service {
+ name = "${service_name}"
+ port = "${service_name}"
+ tags = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ]
+ check {
+ name = "Alertmanager Check Live"
+ type = "http"
+ path = "/-/healthy"
+ interval = "10s"
+ timeout = "2s"
+ }
+ }
+
+ # The "resources" stanza describes the requirements a task needs to
+ # execute. Resource requirements include memory, network, cpu, and more.
+ # This ensures the task will execute on a machine that contains enough
+ # resource capacity.
+ #
+ # For more information and examples on the "resources" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/resources
+ #
+ resources {
+ cpu = ${cpu}
+ memory = ${mem}
+ # The network stanza specifies the networking requirements for the task
+ # group, including the network mode and port allocations. When scheduling
+ # jobs in Nomad they are provisioned across your fleet of machines along
+ # with other jobs and services. Because you don't know in advance what host
+ # your job will be provisioned on, Nomad will provide your tasks with
+ # network configuration when they start up.
+ #
+ # For more information and examples on the "template" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/network
+ #
+ network {
+ port "${service_name}" {
+ static = ${port}
+ }
+ }
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/terraform-ci-infra/1n_nmd/alertmanager/main.tf b/terraform-ci-infra/1n_nmd/alertmanager/main.tf
new file mode 100644
index 0000000000..411c78a558
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/alertmanager/main.tf
@@ -0,0 +1,37 @@
+locals {
+ datacenters = join(",", var.nomad_datacenters)
+
+ alertmanager_url = join("",
+ [
+ "https://github.com",
+ "/prometheus/alertmanager/releases/download/",
+ "v${var.alertmanager_version}/",
+ "alertmanager-${var.alertmanager_version}.linux-amd64.tar.gz"
+ ]
+ )
+}
+
+data "template_file" "nomad_job_alertmanager" {
+ template = file("${path.module}/conf/nomad/alertmanager.hcl")
+ vars = {
+ datacenters = local.datacenters
+ url = local.alertmanager_url
+ job_name = var.alertmanager_job_name
+ use_canary = var.alertmanager_use_canary
+ group_count = var.alertmanager_group_count
+ service_name = var.alertmanager_service_name
+ use_vault_provider = var.alertmanager_vault_secret.use_vault_provider
+ version = var.alertmanager_version
+ cpu = var.alertmanager_cpu
+ mem = var.alertmanager_mem
+ port = var.alertmanager_port
+ slack_api_url = var.alertmanager_slack_api_url
+ slack_channel = var.alertmanager_slack_channel
+ default_receiver = var.alertmanager_default_receiver
+ }
+}
+
+resource "nomad_job" "nomad_job_alertmanager" {
+ jobspec = data.template_file.nomad_job_alertmanager.rendered
+ detach = false
+} \ No newline at end of file
diff --git a/terraform-ci-infra/1n_nmd/alertmanager/variables.tf b/terraform-ci-infra/1n_nmd/alertmanager/variables.tf
new file mode 100644
index 0000000000..ebd862123d
--- /dev/null
+++ b/terraform-ci-infra/1n_nmd/alertmanager/variables.tf
@@ -0,0 +1,84 @@
+# Nomad
+variable "nomad_datacenters" {
+ description = "Nomad data centers"
+ type = list(string)
+ default = [ "dc1" ]
+}
+
+# Alermanager
+variable "alertmanager_job_name" {
+ description = "Job name"
+ type = string
+ default = "alertmanager"
+}
+
+variable "alertmanager_group_count" {
+ description = "Number of group instances"
+ type = number
+ default = 1
+}
+
+variable "alertmanager_service_name" {
+ description = "Service name"
+ type = string
+ default = "alertmanager"
+}
+
+variable "alertmanager_version" {
+ description = "Version"
+ type = string
+ default = "0.21.0"
+}
+
+variable "alertmanager_use_canary" {
+ description = "Uses canary deployment"
+ type = bool
+ default = false
+}
+
+variable "alertmanager_vault_secret" {
+ description = "Set of properties to be able to fetch secret from vault"
+ type = object({
+ use_vault_provider = bool,
+ vault_kv_policy_name = string,
+ vault_kv_path = string,
+ vault_kv_field_access_key = string,
+ vault_kv_field_secret_key = string
+ })
+}
+
+variable "alertmanager_cpu" {
+ description = "CPU allocation"
+ type = number
+ default = 1000
+}
+
+variable "alertmanager_mem" {
+ description = "RAM allocation"
+ type = number
+ default = 1024
+}
+
+variable "alertmanager_port" {
+ description = "TCP allocation"
+ type = number
+ default = 9093
+}
+
+variable "alertmanager_default_receiver" {
+ description = "Alertmanager default receiver"
+ type = string
+ default = "default-receiver"
+}
+
+variable "alertmanager_slack_api_url" {
+ description = "Alertmanager slack API URL"
+ type = string
+ default = "https://hooks.slack.com/services/XXXXXXXXX/XXXXXXXXXXX/XXXXXXXXXXXXXXXXXXXXXXXX"
+}
+
+variable "alertmanager_slack_channel" {
+ description = "Alertmanager slack channel"
+ type = string
+ default = "slack-channel"
+} \ No newline at end of file