diff options
Diffstat (limited to 'fdio.infra.terraform/terraform-nomad-loki')
4 files changed, 437 insertions, 0 deletions
diff --git a/fdio.infra.terraform/terraform-nomad-loki/conf/nomad/loki.hcl.tftpl b/fdio.infra.terraform/terraform-nomad-loki/conf/nomad/loki.hcl.tftpl
new file mode 100644
index 0000000000..7b38437566
--- /dev/null
+++ b/fdio.infra.terraform/terraform-nomad-loki/conf/nomad/loki.hcl.tftpl
@@ -0,0 +1,261 @@
+job "${job_name}" {
+  # The "region" parameter specifies the region in which to execute the job.
+  # If omitted, this inherits the default region name of "global".
+  # region = "${region}"
+
+  # The "datacenters" parameter specifies the list of datacenters which should
+  # be considered when placing this task. Rendered as an HCL list of strings.
+  datacenters = ${datacenters}
+
+  # The "type" parameter controls the type of job, which impacts the scheduler's
+  # decision on placement. This configuration is optional and defaults to
+  # "service". For a full list of job types and their differences, please see
+  # the online documentation.
+  #
+  # https://www.nomadproject.io/docs/jobspec/schedulers
+  #
+  type = "service"
+
+  update {
+    # The "max_parallel" parameter specifies the maximum number of updates to
+    # perform in parallel. In this case, this specifies to update a single task
+    # at a time.
+    max_parallel = ${max_parallel}
+
+    health_check = "checks"
+
+    # The "min_healthy_time" parameter specifies the minimum time the allocation
+    # must be in the healthy state before it is marked as healthy and unblocks
+    # further allocations from being updated.
+    min_healthy_time = "10s"
+
+    # The "healthy_deadline" parameter specifies the deadline in which the
+    # allocation must be marked as healthy after which the allocation is
+    # automatically transitioned to unhealthy. Transitioning to unhealthy will
+    # fail the deployment and potentially roll back the job if "auto_revert" is
+    # set to true.
+    healthy_deadline = "3m"
+
+    # The "progress_deadline" parameter specifies the deadline in which an
+    # allocation must be marked as healthy. The deadline begins when the first
+    # allocation for the deployment is created and is reset whenever an allocation
+    # as part of the deployment transitions to a healthy state. If no allocation
+    # transitions to the healthy state before the progress deadline, the
+    # deployment is marked as failed.
+    progress_deadline = "10m"
+
+%{ if use_canary }
+    # The "canary" parameter specifies that changes to the job that would result
+    # in destructive updates should create the specified number of canaries
+    # without stopping any previous allocations. Once the operator determines the
+    # canaries are healthy, they can be promoted which unblocks a rolling update
+    # of the remaining allocations at a rate of "max_parallel".
+    #
+    # Further, setting "canary" equal to the count of the task group allows
+    # blue/green deployments. When the job is updated, a full set of the new
+    # version is deployed and upon promotion the old version is stopped.
+    canary = ${canary}
+
+    # Specifies if the job should auto-promote to the canary version when all
+    # canaries become healthy during a deployment. Defaults to false which means
+    # canaries must be manually updated with the nomad deployment promote
+    # command.
+    auto_promote = ${auto_promote}
+
+    # The "auto_revert" parameter specifies if the job should auto-revert to the
+    # last stable job on deployment failure. A job is marked as stable if all the
+    # allocations as part of its deployment were marked healthy.
+    auto_revert = ${auto_revert}
+%{ endif }
+  }
+
+  # The "group" stanza defines a series of tasks that should be co-located on
+  # the same Nomad client. Any task within a group will be placed on the same
+  # client.
+  #
+  # https://www.nomadproject.io/docs/job-specification/group
+  #
+  group "${job_name}-group-1" {
+    # The "count" parameter specifies the number of the task groups that should
+    # be running under this group. This value must be non-negative and defaults
+    # to 1.
+    count = ${group_count}
+
+    # The volume stanza allows the group to specify that it requires a given
+    # volume from the cluster. The key of the stanza is the name of the volume
+    # as it will be exposed to task configuration.
+    #
+    # https://www.nomadproject.io/docs/job-specification/volume
+    %{ if use_host_volume }
+    volume "${job_name}-volume-1" {
+      type      = "host"
+      read_only = false
+      source    = "${volume_source}"
+    }
+    %{ endif }
+
+    # The restart stanza configures a task's behavior on task failure. Restarts
+    # happen on the client that is running the task.
+    #
+    # https://www.nomadproject.io/docs/job-specification/restart
+    #
+    restart {
+      interval = "30m"
+      attempts = 40
+      delay    = "15s"
+      mode     = "delay"
+    }
+
+    # The constraint allows restricting the set of eligible nodes. Constraints
+    # may filter on attributes or client metadata.
+    #
+    # https://www.nomadproject.io/docs/job-specification/constraint
+    #
+    constraint {
+      attribute = "$${attr.cpu.arch}"
+      operator  = "!="
+      value     = "arm64"
+    }
+
+    constraint {
+      attribute = "$${node.class}"
+      value     = "builder"
+    }
+
+    # The network stanza specifies the networking requirements for the task
+    # group, including the network mode and port allocations. When scheduling
+    # jobs in Nomad they are provisioned across your fleet of machines along
+    # with other jobs and services. Because you don't know in advance what host
+    # your job will be provisioned on, Nomad will provide your tasks with
+    # network configuration when they start up.
+    #
+    # https://www.nomadproject.io/docs/job-specification/network
+    #
+    network {
+      port "${service_name}" {
+        static = ${port}
+        to     = ${port}
+      }
+    }
+
+    # The "task" stanza creates an individual unit of work, such as a Docker
+    # container, web application, or batch processing.
+    #
+    # https://www.nomadproject.io/docs/job-specification/task
+    #
+    task "${job_name}-task-1" {
+      # The "driver" parameter specifies the task driver that should be used to
+      # run the task.
+      driver = "exec"
+
+      %{ if use_host_volume }
+      volume_mount {
+        volume      = "${job_name}-volume-1"
+        destination = "${volume_destination}"
+        read_only   = false
+      }
+      %{ endif }
+
+      %{ if use_vault_provider }
+      vault {
+        policies = ["${vault_kv_policy_name}"]
+      }
+      %{ endif }
+
+      # The "config" stanza specifies the driver configuration, which is passed
+      # directly to the driver to start the task. "args" belongs here (the
+      # artifact stanza has no such parameter) and points Loki at the
+      # configuration file rendered by the template stanza below.
+      config {
+        command = "local/loki-linux-amd64"
+        args = [
+          "-config.file=secrets/loki.yml"
+        ]
+      }
+
+      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,
+      # such as a file, tarball, or binary. Nomad downloads artifacts using the
+      # popular go-getter library, which permits downloading artifacts from a
+      # variety of locations using a URL as the input source.
+      #
+      # https://www.nomadproject.io/docs/job-specification/artifact
+      #
+      artifact {
+        source = "${url}"
+      }
+
+      template {
+        change_mode   = "noop"
+        change_signal = "SIGINT"
+        destination   = "secrets/loki.yml"
+        data          = <<EOH
+---
+auth_enabled: false
+# Bind the Nomad-assigned port on all interfaces so Consul checks can connect.
+server:
+  http_listen_port: ${port}
+  http_listen_address: 0.0.0.0
+
+schema_config:
+  configs:
+  - from: 2020-05-15
+    store: boltdb
+    object_store: filesystem
+    schema: v11
+    index:
+      prefix: index_
+      period: 168h
+
+storage_config:
+  boltdb:
+    directory: /tmp/loki/index
+
+  filesystem:
+    directory: /tmp/loki/chunks
+
+  aws:
+    bucketnames: loki
+    endpoint: http://storage.service.consul:9000
+    access_key_id: storage
+    secret_access_key: Storage1234  # FIXME: hard-coded secret, read it from Vault
+    insecure: false
+    sse_encryption: false
+    http_config:
+      idle_conn_timeout: 90s
+      response_header_timeout: 0s
+      insecure_skip_verify: false
+    s3forcepathstyle: true
+EOH
+      }
+
+      # The service stanza instructs Nomad to register a service with Consul.
+      # The HTTP check polls Loki's readiness endpoint ("/ready").
+      # https://www.nomadproject.io/docs/job-specification/service
+      #
+      service {
+        name = "${service_name}"
+        port = "${service_name}"
+        tags = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ]
+        check {
+          name     = "Loki Check Live"
+          type     = "http"
+          path     = "/ready"
+          interval = "10s"
+          timeout  = "2s"
+        }
+      }
+
+      # The "resources" stanza describes the requirements a task needs to
+      # execute. Resource requirements include memory, network, cpu, and more.
+      # This ensures the task will execute on a machine that contains enough
+      # resource capacity.
+      #
+      # https://www.nomadproject.io/docs/job-specification/resources
+      #
+      resources {
+        cpu    = ${cpu}
+        memory = ${memory}
+      }
+    }
+  }
+}
diff --git a/fdio.infra.terraform/terraform-nomad-loki/main.tf b/fdio.infra.terraform/terraform-nomad-loki/main.tf
new file mode 100644
index 0000000000..a2fc70d254
--- /dev/null
+++ b/fdio.infra.terraform/terraform-nomad-loki/main.tf
@@ -0,0 +1,40 @@
+locals {
+  # Rendered verbatim into the jobspec as an HCL list, e.g. ["dc1","dc2"].
+  datacenters = jsonencode(var.datacenters)
+  # Release archive fetched by the job's artifact stanza.
+  url = join("", [
+    "https://github.com",
+    "/grafana/loki/releases/download/v${var.gl_version}/loki-linux-amd64.zip"
+  ])
+}
+
+resource "nomad_job" "nomad_job_prometheus" {
+  jobspec = templatefile(
+    "${path.module}/conf/nomad/loki.hcl.tftpl",
+    {
+      auto_promote              = var.auto_promote,
+      auto_revert               = var.auto_revert,
+      canary                    = var.canary,
+      cpu                       = var.cpu,
+      datacenters               = local.datacenters,
+      group_count               = var.group_count,
+      job_name                  = var.job_name,
+      max_parallel              = var.max_parallel,
+      memory                    = var.memory,
+      port                      = var.port,
+      region                    = var.region,
+      service_name              = var.service_name,
+      url                       = local.url,
+      use_canary                = var.use_canary,
+      use_host_volume           = var.use_host_volume,
+      use_vault_provider        = var.vault_secret.use_vault_provider,
+      vault_kv_policy_name      = var.vault_secret.vault_kv_policy_name,
+      vault_kv_path             = var.vault_secret.vault_kv_path,
+      vault_kv_field_access_key = var.vault_secret.vault_kv_field_access_key,
+      vault_kv_field_secret_key = var.vault_secret.vault_kv_field_secret_key,
+      version                   = var.gl_version,
+      volume_destination        = var.volume_destination,
+      volume_source             = var.volume_source
+  })
+  detach = false
+}
diff --git a/fdio.infra.terraform/terraform-nomad-loki/variables.tf b/fdio.infra.terraform/terraform-nomad-loki/variables.tf
new file mode 100644
index 0000000000..049290f5a8
--- /dev/null
+++ b/fdio.infra.terraform/terraform-nomad-loki/variables.tf
@@ -0,0 +1,127 @@
+# Nomad
+variable "datacenters" {
+  description = "Specifies the list of DCs to be considered placing this task"
+  type        = list(string)
+  default     = ["dc1"]
+}
+
+variable "region" {
+  description = "Specifies the region in which to execute the job"
+  type        = string
+  default     = "global"
+}
+
+variable "volume_source" {
+  description = "The name of the volume to request"
+  type        = string
+  default     = ""
+}
+
+# Grafana Loki
+variable "gl_version" {
+  description = "Grafana Loki version"
+  type        = string
+  default     = "2.4.2"
+}
+
+variable "auto_promote" {
+  description = "Specifies if the job should auto-promote to the canary version"
+  type        = bool
+  default     = true
+}
+
+variable "auto_revert" {
+  description = "Specifies if the job should auto-revert to the last stable job"
+  type        = bool
+  default     = true
+}
+
+variable "canary" {
+  description = "Equal to the count of the task group allows blue/green depl."
+  type        = number
+  default     = 1
+}
+
+variable "cpu" {
+  description = "CPU allocation"
+  type        = number
+  default     = 2000
+}
+
+variable "data_dir" {
+  description = "Loki data dir allocation"
+  type        = string
+  default     = ""
+}
+
+variable "group_count" {
+  description = "Specifies the number of the task groups running under this one"
+  type        = number
+  default     = 1
+}
+
+variable "job_name" {
+  description = "Specifies a name for the job"
+  type        = string
+  default     = "loki"
+}
+
+variable "max_parallel" {
+  description = "Specifies the maximum number of updates to perform in parallel"
+  type        = number
+  default     = 1
+}
+
+variable "memory" {
+  description = "Specifies the memory required in MB"
+  type        = number
+  default     = 4096
+}
+
+variable "port" {
+  description = "Specifies the static TCP/UDP port to allocate"
+  type        = number
+  default     = 3100
+}
+
+variable "service_name" {
+  description = "Specifies the name this service will be advertised in Consul"
+  type        = string
+  default     = "loki"
+}
+
+variable "use_canary" {
+  description = "Uses canary deployment"
+  type        = bool
+  default     = true
+}
+
+variable "use_host_volume" {
+  description = "Use Nomad host volume feature"
+  type        = bool
+  default     = false
+}
+
+variable "volume_destination" {
+  description = "Specifies where the volume should be mounted inside the task"
+  type        = string
+  default     = ""
+}
+
+variable "vault_secret" {
+  type = object({
+    use_vault_provider        = bool,
+    vault_kv_policy_name      = string,
+    vault_kv_path             = string,
+    vault_kv_field_access_key = string,
+    vault_kv_field_secret_key = string
+  })
+  description = "Set of properties to be able to fetch secret from vault."
+  default = {
+    use_vault_provider        = false
+    vault_kv_policy_name      = "kv"
+    vault_kv_path             = "secret/data/prometheus"
+    vault_kv_field_access_key = "access_key"
+    vault_kv_field_secret_key = "secret_key"
+  }
+}
diff --git a/fdio.infra.terraform/terraform-nomad-loki/versions.tf b/fdio.infra.terraform/terraform-nomad-loki/versions.tf
new file mode 100644
index 0000000000..a01708f28a
--- /dev/null
+++ b/fdio.infra.terraform/terraform-nomad-loki/versions.tf
@@ -0,0 +1,9 @@
+terraform {
+  required_providers {
+    nomad = {
+      source  = "hashicorp/nomad"
+      version = ">= 1.4.16"
+    }
+  }
+  required_version = ">= 1.1.4"
+}