From ca81b6ccfdcee62846217f824c1e4a1610b8a950 Mon Sep 17 00:00:00 2001
From: Peter Mikus
Date: Thu, 31 Mar 2022 15:59:10 +0200
Subject: refactor(terraform): Directory structure

Signed-off-by: Peter Mikus
Change-Id: I2f3cdf0241aaf7c4a8ba4e00b701df10c9165cf8
---
 .../1n_aws_t3/fdio-csit-dash-app-base/main.tf      |  11 -
 .../1n_aws_t3/fdio-csit-dash-app-base/output.tf    |   0
 .../1n_aws_t3/fdio-csit-dash-app-base/providers.tf |  11 -
 .../1n_aws_t3/fdio-csit-dash-app-base/variables.tf |  28 -
 .../1n_aws_t3/fdio-csit-dash-app-base/versions.tf  |  17 -
 .../1n_aws_t3/fdio-csit-dash-env/main.tf           | 106 ----
 .../1n_aws_t3/fdio-csit-dash-env/output.tf         |   4 -
 .../1n_aws_t3/fdio-csit-dash-env/providers.tf      |  11 -
 .../1n_aws_t3/fdio-csit-dash-env/variables.tf      |  28 -
 .../1n_aws_t3/fdio-csit-dash-env/versions.tf       |  17 -
 .../alertmanager/conf/nomad/alertmanager.hcl.tftpl | 377 ------
 .../1n_nmd/alertmanager/fdio/main.tf               |  14 -
 .../1n_nmd/alertmanager/fdio/providers.tf          |  13 -
 .../1n_nmd/alertmanager/fdio/variables.tf          |  47 --
 .../1n_nmd/alertmanager/fdio/versions.tf           |  17 -
 fdio.infra.terraform/1n_nmd/alertmanager/main.tf   |  48 --
 .../1n_nmd/alertmanager/variables.tf               | 157 ------
 .../1n_nmd/alertmanager/versions.tf                |   9 -
 .../1n_nmd/etl/conf/nomad/etl.hcl.tftpl            | 318 -----
 fdio.infra.terraform/1n_nmd/etl/fdio/main.tf       |  23 -
 fdio.infra.terraform/1n_nmd/etl/fdio/providers.tf  |  13 -
 fdio.infra.terraform/1n_nmd/etl/fdio/variables.tf  |  47 --
 fdio.infra.terraform/1n_nmd/etl/fdio/versions.tf   |  17 -
 fdio.infra.terraform/1n_nmd/etl/main.tf            |  33 --
 fdio.infra.terraform/1n_nmd/etl/variables.tf       | 115 ----
 fdio.infra.terraform/1n_nmd/etl/versions.tf        |   9 -
 fdio.infra.terraform/1n_nmd/main.tf                |  89 ---
 .../1n_nmd/minio_s3_gateway/conf/nomad/minio.hcl   | 246 --------
 .../1n_nmd/minio_s3_gateway/main.tf                |  51 --
 .../1n_nmd/minio_s3_gateway/variables.tf           | 199 -------
 .../1n_nmd/minio_s3_gateway/versions.tf            |  13 -
 .../prometheus/conf/nomad/prometheus.hcl.tftpl     | 624 ---------
 .../1n_nmd/prometheus/fdio/main.tf                 |  10 -
 .../1n_nmd/prometheus/fdio/providers.tf            |  13 -
 .../1n_nmd/prometheus/fdio/variables.tf            |  47 --
 .../1n_nmd/prometheus/fdio/versions.tf             |  17 -
 fdio.infra.terraform/1n_nmd/prometheus/main.tf     |  42 --
 .../1n_nmd/prometheus/variables.tf                 | 127 -----
 fdio.infra.terraform/1n_nmd/prometheus/versions.tf |   9 -
 .../1n_nmd/vault-aws-secret-backend/fdio/main.tf   |  17 -
 .../vault-aws-secret-backend/fdio/providers.tf     |   5 -
 .../vault-aws-secret-backend/fdio/variables.tf     |  17 -
 .../vault-aws-secret-backend/fdio/versions.tf      |  13 -
 .../1n_nmd/vault-aws-secret-backend/main.tf        |  37 --
 .../1n_nmd/vault-aws-secret-backend/variables.tf   |  17 -
 .../1n_nmd/vault-aws-secret-backend/versions.tf    |   8 -
 .../terraform-aws-fdio-csit-dash-app-base/main.tf  |  11 +
 .../output.tf                                      |   0
 .../providers.tf                                   |  11 +
 .../variables.tf                                   |  28 +
 .../versions.tf                                    |  17 +
 .../terraform-aws-fdio-csit-dash-env/main.tf       | 106 ++++
 .../terraform-aws-fdio-csit-dash-env/output.tf     |   4 +
 .../terraform-aws-fdio-csit-dash-env/providers.tf  |  11 +
 .../terraform-aws-fdio-csit-dash-env/variables.tf  |  28 +
 .../terraform-aws-fdio-csit-dash-env/versions.tf   |  17 +
 .../conf/nomad/alertmanager.hcl.tftpl              | 377 +++++++++++++
 .../terraform-nomad-alertmanager/fdio/main.tf      |  14 +
 .../terraform-nomad-alertmanager/fdio/providers.tf |  13 +
 .../terraform-nomad-alertmanager/fdio/variables.tf |  47 ++
 .../terraform-nomad-alertmanager/fdio/versions.tf  |  17 +
 .../terraform-nomad-alertmanager/main.tf           |  48 ++
 .../terraform-nomad-alertmanager/variables.tf      | 157 ++++++
 .../terraform-nomad-alertmanager/versions.tf       |   9 +
 .../terraform-nomad-loki/conf/nomad/loki.hcl.tftpl | 261 +++++++++
 fdio.infra.terraform/terraform-nomad-loki/main.tf  |  40 ++
 .../terraform-nomad-loki/variables.tf              | 127 +++++
 .../terraform-nomad-loki/versions.tf               |   9 +
 .../conf/nomad/prometheus.hcl.tftpl                | 617 ++++++++++++++++++++
 .../terraform-nomad-prometheus/fdio/main.tf        |  10 +
 .../terraform-nomad-prometheus/fdio/providers.tf   |  13 +
 .../terraform-nomad-prometheus/fdio/variables.tf   |  47 ++
 .../terraform-nomad-prometheus/fdio/versions.tf    |  17 +
 .../terraform-nomad-prometheus/main.tf             |  41 ++
 .../terraform-nomad-prometheus/variables.tf        | 127 +++++
 .../terraform-nomad-prometheus/versions.tf         |   9 +
 .../conf/nomad/etl.hcl.tftpl                       | 318 +++++++++++
 .../terraform-nomad-pyspark-etl/fdio/main.tf       |  23 +
 .../terraform-nomad-pyspark-etl/fdio/providers.tf  |  13 +
 .../terraform-nomad-pyspark-etl/fdio/variables.tf  |  47 ++
 .../terraform-nomad-pyspark-etl/fdio/versions.tf   |  17 +
 .../terraform-nomad-pyspark-etl/main.tf            |  33 ++
 .../terraform-nomad-pyspark-etl/variables.tf       | 115 ++++
 .../terraform-nomad-pyspark-etl/versions.tf        |   9 +
 .../fdio/main.tf                                   |  17 +
 .../fdio/providers.tf                              |   5 +
 .../fdio/variables.tf                              |  17 +
 .../fdio/versions.tf                               |  13 +
 .../terraform-vault-aws-secret-backend/main.tf     |  37 ++
 .../variables.tf                                   |  17 +
 .../terraform-vault-aws-secret-backend/versions.tf |   8 +
 91 files changed, 2922 insertions(+), 3091 deletions(-)
 delete mode 100644 fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-app-base/main.tf
 delete mode 100644 fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-app-base/output.tf
 delete mode 100644 fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-app-base/providers.tf
 delete mode 100644 fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-app-base/variables.tf
 delete mode 100644 fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-app-base/versions.tf
 delete mode 100644 fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-env/main.tf
 delete mode 100644 fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-env/output.tf
 delete mode 100644 fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-env/providers.tf
 delete mode 100644 fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-env/variables.tf
 delete mode 100644 fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-env/versions.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl.tftpl
 delete mode 100644 fdio.infra.terraform/1n_nmd/alertmanager/fdio/main.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/alertmanager/fdio/providers.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/alertmanager/fdio/variables.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/alertmanager/fdio/versions.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/alertmanager/main.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/alertmanager/variables.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/alertmanager/versions.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/etl/conf/nomad/etl.hcl.tftpl
 delete mode 100644 fdio.infra.terraform/1n_nmd/etl/fdio/main.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/etl/fdio/providers.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/etl/fdio/variables.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/etl/fdio/versions.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/etl/main.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/etl/variables.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/etl/versions.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/minio_s3_gateway/conf/nomad/minio.hcl
 delete mode 100644 fdio.infra.terraform/1n_nmd/minio_s3_gateway/main.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/minio_s3_gateway/variables.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/minio_s3_gateway/versions.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/prometheus/conf/nomad/prometheus.hcl.tftpl
 delete mode 100644 fdio.infra.terraform/1n_nmd/prometheus/fdio/main.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/prometheus/fdio/providers.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/prometheus/fdio/variables.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/prometheus/fdio/versions.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/prometheus/main.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/prometheus/variables.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/prometheus/versions.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/vault-aws-secret-backend/fdio/main.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/vault-aws-secret-backend/fdio/providers.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/vault-aws-secret-backend/fdio/variables.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/vault-aws-secret-backend/fdio/versions.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/vault-aws-secret-backend/main.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/vault-aws-secret-backend/variables.tf
 delete mode 100644 fdio.infra.terraform/1n_nmd/vault-aws-secret-backend/versions.tf
 create mode 100644 fdio.infra.terraform/terraform-aws-fdio-csit-dash-app-base/main.tf
 create mode 100644 fdio.infra.terraform/terraform-aws-fdio-csit-dash-app-base/output.tf
 create mode 100644 fdio.infra.terraform/terraform-aws-fdio-csit-dash-app-base/providers.tf
 create mode 100644 fdio.infra.terraform/terraform-aws-fdio-csit-dash-app-base/variables.tf
 create mode 100644 fdio.infra.terraform/terraform-aws-fdio-csit-dash-app-base/versions.tf
 create mode 100644 fdio.infra.terraform/terraform-aws-fdio-csit-dash-env/main.tf
 create mode 100644 fdio.infra.terraform/terraform-aws-fdio-csit-dash-env/output.tf
 create mode 100644 fdio.infra.terraform/terraform-aws-fdio-csit-dash-env/providers.tf
 create mode 100644 fdio.infra.terraform/terraform-aws-fdio-csit-dash-env/variables.tf
 create mode 100644 fdio.infra.terraform/terraform-aws-fdio-csit-dash-env/versions.tf
 create mode 100644 fdio.infra.terraform/terraform-nomad-alertmanager/conf/nomad/alertmanager.hcl.tftpl
 create mode 100644 fdio.infra.terraform/terraform-nomad-alertmanager/fdio/main.tf
 create mode 100644 fdio.infra.terraform/terraform-nomad-alertmanager/fdio/providers.tf
 create mode 100644 fdio.infra.terraform/terraform-nomad-alertmanager/fdio/variables.tf
 create mode 100644 fdio.infra.terraform/terraform-nomad-alertmanager/fdio/versions.tf
 create mode 100644 fdio.infra.terraform/terraform-nomad-alertmanager/main.tf
 create mode 100644 fdio.infra.terraform/terraform-nomad-alertmanager/variables.tf
 create mode 100644 fdio.infra.terraform/terraform-nomad-alertmanager/versions.tf
 create mode 100644 fdio.infra.terraform/terraform-nomad-loki/conf/nomad/loki.hcl.tftpl
 create mode 100644 fdio.infra.terraform/terraform-nomad-loki/main.tf
 create mode 100644 fdio.infra.terraform/terraform-nomad-loki/variables.tf
 create mode 100644 fdio.infra.terraform/terraform-nomad-loki/versions.tf
 create mode 100644 fdio.infra.terraform/terraform-nomad-prometheus/conf/nomad/prometheus.hcl.tftpl
 create mode 100644 fdio.infra.terraform/terraform-nomad-prometheus/fdio/main.tf
 create mode 100644 fdio.infra.terraform/terraform-nomad-prometheus/fdio/providers.tf
 create mode 100644 fdio.infra.terraform/terraform-nomad-prometheus/fdio/variables.tf
 create mode 100644 fdio.infra.terraform/terraform-nomad-prometheus/fdio/versions.tf
 create mode 100644 fdio.infra.terraform/terraform-nomad-prometheus/main.tf
 create mode 100644 fdio.infra.terraform/terraform-nomad-prometheus/variables.tf
 create mode 100644 fdio.infra.terraform/terraform-nomad-prometheus/versions.tf
 create mode 100644 fdio.infra.terraform/terraform-nomad-pyspark-etl/conf/nomad/etl.hcl.tftpl
 create mode 100644 fdio.infra.terraform/terraform-nomad-pyspark-etl/fdio/main.tf
 create mode 100644 fdio.infra.terraform/terraform-nomad-pyspark-etl/fdio/providers.tf
 create mode 100644 fdio.infra.terraform/terraform-nomad-pyspark-etl/fdio/variables.tf
 create mode 100644 fdio.infra.terraform/terraform-nomad-pyspark-etl/fdio/versions.tf
 create mode 100644 fdio.infra.terraform/terraform-nomad-pyspark-etl/main.tf
 create mode 100644 fdio.infra.terraform/terraform-nomad-pyspark-etl/variables.tf
 create mode 100644 fdio.infra.terraform/terraform-nomad-pyspark-etl/versions.tf
 create mode 100644 fdio.infra.terraform/terraform-vault-aws-secret-backend/fdio/main.tf
 create mode 100644 fdio.infra.terraform/terraform-vault-aws-secret-backend/fdio/providers.tf
 create mode 100644 fdio.infra.terraform/terraform-vault-aws-secret-backend/fdio/variables.tf
 create mode 100644 fdio.infra.terraform/terraform-vault-aws-secret-backend/fdio/versions.tf
 create mode 100644 fdio.infra.terraform/terraform-vault-aws-secret-backend/main.tf
 create mode 100644 fdio.infra.terraform/terraform-vault-aws-secret-backend/variables.tf
 create mode 100644 fdio.infra.terraform/terraform-vault-aws-secret-backend/versions.tf
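Every module in this change moves from the machine-named trees (1n_aws_t3/, 1n_nmd/) to flat, registry-style terraform-<provider>-<name>/ directories, so any configuration that consumed these modules by relative path must point at the new location. A minimal sketch of the caller-side change (module label and paths here are illustrative, not taken from this patch):

    module "etl" {
      # Before the refactor (illustrative):
      # source = "../../1n_nmd/etl"
      # After the refactor:
      source = "../terraform-nomad-pyspark-etl"

      datacenters = ["yul1"]
    }

A rename like this should not disturb Terraform state: resource addresses are derived from module labels, not directory names, and the Consul backend paths embedded in each versions.tf are unchanged. Only the source references need to stay consistent.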
diff --git a/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-app-base/main.tf b/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-app-base/main.tf
deleted file mode 100644
index 63d30e4098..0000000000
--- a/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-app-base/main.tf
+++ /dev/null
@@ -1,11 +0,0 @@
-data "vault_aws_access_credentials" "creds" {
-  backend = "${var.vault_name}-path"
-  role    = "${var.vault_name}-role"
-}
-
-module "elastic_beanstalk_application_version" {
-  source                   = "../../terraform-aws-elastic-beanstalk-application-version"
-  application_description  = "FD.io CSIT Results Dashboard"
-  application_name         = "fdio-csit-dash-app"
-  application_version_name = "fdio-csit-dash-app-base"
-}
diff --git a/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-app-base/output.tf b/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-app-base/output.tf
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-app-base/providers.tf b/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-app-base/providers.tf
deleted file mode 100644
index 7241b27c16..0000000000
--- a/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-app-base/providers.tf
+++ /dev/null
@@ -1,11 +0,0 @@
-provider "aws" {
-  region     = var.region
-  access_key = data.vault_aws_access_credentials.creds.access_key
-  secret_key = data.vault_aws_access_credentials.creds.secret_key
-}
-
-provider "vault" {
-  address         = var.vault_provider_address
-  skip_tls_verify = var.vault_provider_skip_tls_verify
-  token           = var.vault_provider_token
-}
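The provider wiring above never stores long-lived AWS keys: the vault_aws_access_credentials data source asks Vault's AWS secrets engine for short-lived credentials at plan/apply time, following the "<name>-path" / "<name>-role" naming convention. The Vault side of that convention is configured by the vault-aws-secret-backend module, whose body is not shown in this hunk; a rough sketch of what such a configuration looks like with the Vault provider (variable names, TTLs, and the IAM policy are illustrative):

    resource "vault_aws_secret_backend" "aws" {
      path       = "${var.name}-path"
      access_key = var.aws_access_key
      secret_key = var.aws_secret_key

      default_lease_ttl_seconds = 120
      max_lease_ttl_seconds     = 240
    }

    resource "vault_aws_secret_backend_role" "role" {
      backend         = vault_aws_secret_backend.aws.path
      name            = "${var.name}-role"
      credential_type = "iam_user"

      # Permissions granted to the generated IAM user (illustrative).
      policy_document = jsonencode({
        Version = "2012-10-17"
        Statement = [{
          Effect   = "Allow"
          Action   = ["ec2:*", "elasticbeanstalk:*", "s3:*"]
          Resource = ["*"]
        }]
      })
    }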
@@ -variable "region" { - description = "AWS Region." - type = string - default = "us-east-1" -} - -variable "vault_provider_address" { - description = "Vault cluster address." - type = string - default = "http://10.30.51.28:8200" -} - -variable "vault_provider_skip_tls_verify" { - description = "Verification of the Vault server's TLS certificate." - type = bool - default = false -} - -variable "vault_provider_token" { - description = "Vault root token." - type = string - sensitive = true -} - -variable "vault_name" { - type = string - default = "dynamic-aws-creds-vault-fdio-csit-jenkins" -} diff --git a/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-app-base/versions.tf b/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-app-base/versions.tf deleted file mode 100644 index 4afbbc00a7..0000000000 --- a/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-app-base/versions.tf +++ /dev/null @@ -1,17 +0,0 @@ -terraform { - backend "consul" { - address = "10.32.8.14:8500" - scheme = "http" - path = "terraform/dash" - } - required_providers { - aws = { - source = "hashicorp/aws" - version = ">= 4.3.0" - } - vault = { - version = ">= 3.2.1" - } - } - required_version = ">= 1.1.4" -} diff --git a/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-env/main.tf b/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-env/main.tf deleted file mode 100644 index fa7bcea8d9..0000000000 --- a/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-env/main.tf +++ /dev/null @@ -1,106 +0,0 @@ -data "vault_generic_secret" "fdio_docs" { - path = "kv/secret/data/etl/fdio_docs" -} - -data "vault_aws_access_credentials" "creds" { - backend = "${var.vault_name}-path" - role = "${var.vault_name}-role" -} - -module "elastic_beanstalk_application" { - source = "../../terraform-aws-elastic-beanstalk-application" - - # application - application_description = "FD.io CSIT Results Dashboard" - application_name = "fdio-csit-dash-app" - appversion_lifecycle_service_role_arn = "" - appversion_lifecycle_max_count = 2 - appversion_lifecycle_delete_source_from_s3 = false -} - -module "elastic_beanstalk_environment" { - source = "../../terraform-aws-elastic-beanstalk-environment" - - # vpc - vpc_cidr_block = "192.168.0.0/24" - vpc_enable_dns_hostnames = true - vpc_enable_dns_support = true - vpc_instance_tenancy = "default" - - # subnet - subnet_availability_zone = "us-east-1a" - - # environment - environment_application = module.elastic_beanstalk_application.application_name - environment_description = module.elastic_beanstalk_application.application_description - environment_name = "fdio-csit-dash-env" - environment_solution_stack_name = "64bit Amazon Linux 2 v3.3.11 running Python 3.8" - environment_tier = "WebServer" - environment_wait_for_ready_timeout = "20m" - environment_version_label = "" - - # aws:ec2:instances - instances_instance_types = "t3a.xlarge" - - # aws:ec2:vpc - associate_public_ip_address = true - elb_scheme = "public" - - # aws:elbv2:listener:default - default_listener_enabled = true - - # aws:elasticbeanstalk:environment - environment_loadbalancer_type = "network" - - # aws:elasticbeanstalk:environment:process:default - environment_process_default_healthcheck_interval = 10 - environment_process_default_healthy_threshold_count = 3 - environment_process_default_port = 5000 - environment_process_default_unhealthy_threshold_count = 3 - - # aws:elasticbeanstalk:healthreporting:system - healthreporting_system_type = "enhanced" - - # aws:elasticbeanstalk:managedactions - managedactions_managed_actions_enabled = true - 
managedactions_preferred_start_time = "Sun:10:00" - - # aws:elasticbeanstalk:managedactions:platformupdate - managedactions_platformupdate_update_level = "minor" - managedactions_platformupdate_instance_refresh_enabled = true - - # aws:autoscaling:asg - autoscaling_asg_minsize = 1 - autoscaling_asg_maxsize = 2 - - # aws:autoscaling:trigger - autoscaling_trigger_measure_name = "CPUUtilization" - autoscaling_trigger_statistic = "Average" - autoscaling_trigger_unit = "Percent" - autoscaling_trigger_lower_threshold = 20 - autoscaling_trigger_lower_breach_scale_increment = -1 - autoscaling_trigger_upper_threshold = 80 - autoscaling_trigger_upper_breach_scale_increment = 1 - - # aws:elasticbeanstalk:hostmanager - hostmanager_log_publication_control = true - - # aws:elasticbeanstalk:cloudwatch:logs - cloudwatch_logs_stream_logs = true - cloudwatch_logs_delete_on_terminate = true - cloudwatch_logs_retention_in_days = 3 - - # aws:elasticbeanstalk:cloudwatch:logs:health - cloudwatch_logs_health_health_streaming_enabled = true - cloudwatch_logs_health_delete_on_terminate = true - cloudwatch_logs_health_retention_in_days = 3 - - environment_type = "LoadBalanced" - - # aws:elasticbeanstalk:application:environment - environment_variables = { - "AWS_ACCESS_KEY_ID" = data.vault_generic_secret.fdio_docs.data["access_key"] - "AWS_SECRET_ACCESS_KEY" = data.vault_generic_secret.fdio_docs.data["secret_key"] - "AWS_DEFAULT_REGION" = data.vault_generic_secret.fdio_docs.data["region"] - } -} diff --git a/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-env/output.tf b/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-env/output.tf deleted file mode 100644 index 094c8f5422..0000000000 --- a/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-env/output.tf +++ /dev/null @@ -1,4 +0,0 @@ -output "elastic_beanstalk_environment_hostname" { - description = "DNS hostname" - value = module.elastic_beanstalk_environment.environment_cname -} diff --git a/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-env/providers.tf b/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-env/providers.tf deleted file mode 100644 index 7241b27c16..0000000000 --- a/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-env/providers.tf +++ /dev/null @@ -1,11 +0,0 @@ -provider "aws" { - region = var.region - access_key = data.vault_aws_access_credentials.creds.access_key - secret_key = data.vault_aws_access_credentials.creds.secret_key -} - -provider "vault" { - address = var.vault_provider_address - skip_tls_verify = var.vault_provider_skip_tls_verify - token = var.vault_provider_token -} diff --git a/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-env/variables.tf b/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-env/variables.tf deleted file mode 100644 index 25790290bc..0000000000 --- a/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-env/variables.tf +++ /dev/null @@ -1,28 +0,0 @@ -variable "region" { - description = "AWS Region." - type = string - default = "us-east-1" -} - -variable "vault_provider_address" { - description = "Vault cluster address." - type = string - default = "http://10.30.51.28:8200" -} - -variable "vault_provider_skip_tls_verify" { - description = "Verification of the Vault server's TLS certificate." - type = bool - default = false -} - -variable "vault_provider_token" { - description = "Vault root token." 
diff --git a/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-env/variables.tf b/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-env/variables.tf
deleted file mode 100644
index 25790290bc..0000000000
--- a/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-env/variables.tf
+++ /dev/null
@@ -1,28 +0,0 @@
-variable "region" {
-  description = "AWS Region."
-  type        = string
-  default     = "us-east-1"
-}
-
-variable "vault_provider_address" {
-  description = "Vault cluster address."
-  type        = string
-  default     = "http://10.30.51.28:8200"
-}
-
-variable "vault_provider_skip_tls_verify" {
-  description = "Verification of the Vault server's TLS certificate."
-  type        = bool
-  default     = false
-}
-
-variable "vault_provider_token" {
-  description = "Vault root token."
-  type        = string
-  sensitive   = true
-}
-
-variable "vault_name" {
-  type    = string
-  default = "dynamic-aws-creds-vault-fdio-csit-jenkins"
-}
diff --git a/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-env/versions.tf b/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-env/versions.tf
deleted file mode 100644
index 4afbbc00a7..0000000000
--- a/fdio.infra.terraform/1n_aws_t3/fdio-csit-dash-env/versions.tf
+++ /dev/null
@@ -1,17 +0,0 @@
-terraform {
-  backend "consul" {
-    address = "10.32.8.14:8500"
-    scheme  = "http"
-    path    = "terraform/dash"
-  }
-  required_providers {
-    aws = {
-      source  = "hashicorp/aws"
-      version = ">= 4.3.0"
-    }
-    vault = {
-      version = ">= 3.2.1"
-    }
-  }
-  required_version = ">= 1.1.4"
-}
diff --git a/fdio.infra.terraform/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl.tftpl b/fdio.infra.terraform/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl.tftpl
deleted file mode 100644
index 87206ac5a0..0000000000
--- a/fdio.infra.terraform/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl.tftpl
+++ /dev/null
@@ -1,377 +0,0 @@
-job "${job_name}" {
-  # The "region" parameter specifies the region in which to execute the job.
-  # If omitted, this inherits the default region name of "global".
-  # region = "${region}"
-
-  # The "datacenters" parameter specifies the list of datacenters which should
-  # be considered when placing this task. This must be provided.
-  datacenters = "${datacenters}"
-
-  # The "type" parameter controls the type of job, which impacts the scheduler's
-  # decision on placement. This configuration is optional and defaults to
-  # "service". For a full list of job types and their differences, please see
-  # the online documentation.
-  #
-  # https://www.nomadproject.io/docs/jobspec/schedulers
-  #
-  type = "service"
-
-  update {
-    # The "max_parallel" parameter specifies the maximum number of updates to
-    # perform in parallel. In this case, this specifies to update a single task
-    # at a time.
-    max_parallel = ${max_parallel}
-
-    health_check = "checks"
-
-    # The "min_healthy_time" parameter specifies the minimum time the allocation
-    # must be in the healthy state before it is marked as healthy and unblocks
-    # further allocations from being updated.
-    min_healthy_time = "10s"
-
-    # The "healthy_deadline" parameter specifies the deadline in which the
-    # allocation must be marked as healthy after which the allocation is
-    # automatically transitioned to unhealthy. Transitioning to unhealthy will
-    # fail the deployment and potentially roll back the job if "auto_revert" is
-    # set to true.
-    healthy_deadline = "3m"
-
-    # The "progress_deadline" parameter specifies the deadline in which an
-    # allocation must be marked as healthy. The deadline begins when the first
-    # allocation for the deployment is created and is reset whenever an allocation
-    # as part of the deployment transitions to a healthy state. If no allocation
-    # transitions to the healthy state before the progress deadline, the
-    # deployment is marked as failed.
-    progress_deadline = "10m"
-
-%{ if use_canary }
-    # The "canary" parameter specifies that changes to the job that would result
-    # in destructive updates should create the specified number of canaries
-    # without stopping any previous allocations. Once the operator determines the
-    # canaries are healthy, they can be promoted which unblocks a rolling update
-    # of the remaining allocations at a rate of "max_parallel".
-    #
-    # Further, setting "canary" equal to the count of the task group allows
-    # blue/green deployments. When the job is updated, a full set of the new
-    # version is deployed and upon promotion the old version is stopped.
-    canary = ${canary}
-
-    # Specifies if the job should auto-promote to the canary version when all
-    # canaries become healthy during a deployment. Defaults to false which means
-    # canaries must be manually updated with the nomad deployment promote
-    # command.
-    auto_promote = ${auto_promote}
-
-    # The "auto_revert" parameter specifies if the job should auto-revert to the
-    # last stable job on deployment failure. A job is marked as stable if all the
-    # allocations as part of its deployment were marked healthy.
-    auto_revert = ${auto_revert}
-%{ endif }
-  }
-
-  # All groups in this job should be scheduled on different hosts.
-  constraint {
-    operator = "distinct_hosts"
-    value    = "true"
-  }
-
-  # The "group" stanza defines a series of tasks that should be co-located on
-  # the same Nomad client. Any task within a group will be placed on the same
-  # client.
-  #
-  # https://www.nomadproject.io/docs/job-specification/group
-  #
-  group "${job_name}-group-1" {
-    # The "count" parameter specifies the number of the task groups that should
-    # be running under this group. This value must be non-negative and defaults
-    # to 1.
-    count = ${group_count}
-
-    # The volume stanza allows the group to specify that it requires a given
-    # volume from the cluster. The key of the stanza is the name of the volume
-    # as it will be exposed to task configuration.
-    #
-    # https://www.nomadproject.io/docs/job-specification/volume
-    %{ if use_host_volume }
-    volume "${job_name}-volume-1" {
-      type      = "host"
-      read_only = false
-      source    = "${volume_source}"
-    }
-    %{ endif }
-
-    # The restart stanza configures a task's behavior on task failure. Restarts
-    # happen on the client that is running the task.
-    #
-    # https://www.nomadproject.io/docs/job-specification/restart
-    #
-    restart {
-      interval = "30m"
-      attempts = 40
-      delay    = "15s"
-      mode     = "delay"
-    }
-
-    # The constraint allows restricting the set of eligible nodes. Constraints
-    # may filter on attributes or client metadata.
-    #
-    # https://www.nomadproject.io/docs/job-specification/constraint
-    #
-    constraint {
-      attribute = "$${attr.cpu.arch}"
-      operator  = "!="
-      value     = "arm64"
-    }
-
-    constraint {
-      attribute = "$${node.class}"
-      value     = "builder"
-    }
-
-    # The network stanza specifies the networking requirements for the task
-    # group, including the network mode and port allocations. When scheduling
-    # jobs in Nomad they are provisioned across your fleet of machines along
-    # with other jobs and services. Because you don't know in advance what host
-    # your job will be provisioned on, Nomad will provide your tasks with
-    # network configuration when they start up.
-    #
-    # https://www.nomadproject.io/docs/job-specification/network
-    #
-    network {
-      port "${service_name}" {
-        static = ${port}
-        to     = ${port}
-      }
-    }
-
-    # The "task" stanza creates an individual unit of work, such as a Docker
-    # container, web application, or batch processing.
-    #
-    # https://www.nomadproject.io/docs/job-specification/task
-    #
-    task "${job_name}-task-1" {
-      # The "driver" parameter specifies the task driver that should be used to
-      # run the task.
-      driver = "exec"
-
-      %{ if use_host_volume }
-      volume_mount {
-        volume      = "${job_name}-volume-1"
-        destination = "${volume_destination}"
-        read_only   = false
-      }
-      %{ endif }
-
-      %{ if use_vault_provider }
-      vault {
-        policies = "${vault_kv_policy_name}"
-      }
-      %{ endif }
-
-      # The "config" stanza specifies the driver configuration, which is passed
-      # directly to the driver to start the task. The details of configurations
-      # are specific to each driver, so please see specific driver
-      # documentation for more information.
-      config {
-        command = "local/alertmanager-${version}.linux-amd64/alertmanager"
-        args    = [
-          "--config.file=secrets/alertmanager.yml"
-        ]
-      }
-
-      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,
-      # such as a file, tarball, or binary. Nomad downloads artifacts using the
-      # popular go-getter library, which permits downloading artifacts from a
-      # variety of locations using a URL as the input source.
-      #
-      # https://www.nomadproject.io/docs/job-specification/artifact
-      #
-      artifact {
-        source = "${url}"
-      }
-
-      # The "template" stanza instructs Nomad to manage a template, such as
-      # a configuration file or script. This template can optionally pull data
-      # from Consul or Vault to populate runtime configuration data.
-      #
-      # https://www.nomadproject.io/docs/job-specification/template
-      #
-      template {
-        change_mode     = "noop"
-        change_signal   = "SIGINT"
-        destination     = "secrets/alertmanager.yml"
-        left_delimiter  = "{{{"
-        right_delimiter = "}}}"
-        data            = <<EOH
-#
-# # Certificate and key files for client cert authentication to the server.
-# cert_file:
-# key_file:
-#
-# # ServerName extension to indicate the name of the server.
-# # http://tools.ietf.org/html/rfc4366#section-3.1
-# server_name:
-#
-# # Disable validation of the server certificate.
-# insecure_skip_verify: true
-
-# The root route on which each incoming alert enters.
-route:
-  receiver: '${slack_default_receiver}'
-
-  # The labels by which incoming alerts are grouped together. For example,
-  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
-  # be batched into a single group.
-  #
-  # To aggregate by all possible labels use '...' as the sole label name.
-  # This effectively disables aggregation entirely, passing through all
-  # alerts as-is. This is unlikely to be what you want, unless you have
-  # a very low alert volume or your upstream notification system performs
-  # its own grouping. Example: group_by: [...]
-  group_by: ['alertname']
-
-  # When a new group of alerts is created by an incoming alert, wait at
-  # least 'group_wait' to send the initial notification.
-  # This ensures that multiple alerts for the same group that start firing
-  # shortly after one another are batched together on the first
-  # notification.
-  group_wait: 30s
-
-  # When the first notification was sent, wait 'group_interval' to send a batch
-  # of new alerts that started firing for that group.
-  group_interval: 5m
-
-  # If an alert has successfully been sent, wait 'repeat_interval' to
-  # resend them.
-  repeat_interval: 3h
-
-  # All the above attributes are inherited by all child routes and can be
-  # overwritten on each.
-  # The child route trees.
-  routes:
-  - match_re:
-      alertname: JenkinsJob.*
-    receiver: ${slack_jenkins_receiver}
-    routes:
-    - match:
-        severity: critical
-      receiver: '${slack_jenkins_receiver}'
-
-  - match_re:
-      service: .*
-    receiver: ${slack_default_receiver}
-    routes:
-    - match:
-        severity: critical
-      receiver: '${slack_default_receiver}'
-
-# Inhibition rules allow muting a set of alerts given that another alert is
-# firing.
-# We use this to mute any warning-level notifications if the same alert is
-# already critical.
-inhibit_rules:
-- source_match:
-    severity: 'critical'
-  target_match:
-    severity: 'warning'
-  equal: ['alertname', 'instance']
-
-receivers:
-- name: '${slack_jenkins_receiver}'
-  slack_configs:
-  - api_url: 'https://hooks.slack.com/services/${slack_jenkins_api_key}'
-    channel: '#${slack_jenkins_channel}'
-    send_resolved: true
-    icon_url: https://avatars3.githubusercontent.com/u/3380462
-    title: |-
-     [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
-     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}
-       {{" "}}(
-       {{- with .CommonLabels.Remove .GroupLabels.Names }}
-         {{- range $index, $label := .SortedPairs -}}
-           {{ if $index }}, {{ end }}
-           {{- $label.Name }}="{{ $label.Value -}}"
-         {{- end }}
-       {{- end -}}
-       )
-     {{- end }}
-    text: >-
-     {{ range .Alerts -}}
-     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}
-
-     *Description:* {{ .Annotations.description }}
-
-     *Details:*
-       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
-       {{ end }}
-     {{ end }}
-
-- name: '${slack_default_receiver}'
-  slack_configs:
-  - api_url: 'https://hooks.slack.com/services/${slack_default_api_key}'
-    channel: '#${slack_default_channel}'
-    send_resolved: true
-    icon_url: https://avatars3.githubusercontent.com/u/3380462
-    title: |-
-     [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
-     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}
-       {{" "}}(
-       {{- with .CommonLabels.Remove .GroupLabels.Names }}
-         {{- range $index, $label := .SortedPairs -}}
-           {{ if $index }}, {{ end }}
-           {{- $label.Name }}="{{ $label.Value -}}"
-         {{- end }}
-       {{- end -}}
-       )
-     {{- end }}
-    text: >-
-     {{ range .Alerts -}}
-     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}
-
-     *Description:* {{ .Annotations.description }}
-
-     *Details:*
-       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
-       {{ end }}
-     {{ end }}
-EOH
-      }
-
-      # The service stanza instructs Nomad to register a service with Consul.
-      #
-      # https://www.nomadproject.io/docs/job-specification/service
-      #
-      service {
-        name = "${service_name}"
-        port = "${service_name}"
-        tags = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ]
-        check {
-          name     = "Alertmanager Check Live"
-          type     = "http"
-          path     = "/-/healthy"
-          interval = "10s"
-          timeout  = "2s"
-        }
-      }
-
-      # The "resources" stanza describes the requirements a task needs to
-      # execute. Resource requirements include memory, network, cpu, and more.
-      # This ensures the task will execute on a machine that contains enough
-      # resource capacity.
-      #
-      # https://www.nomadproject.io/docs/job-specification/resources
-      #
-      resources {
-        cpu    = ${cpu}
-        memory = ${memory}
-      }
-    }
-  }
-}
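A note on the interpolation syntax used throughout this job template: because the file is rendered by Terraform's templatefile(), `${...}` sequences (job_name, port, url, ...) are substituted at plan time, while `$${...}` escapes the dollar sign so that strings such as `$${attr.cpu.arch}` and `$${NOMAD_ALLOC_INDEX}` reach Nomad verbatim as runtime interpolations; `%{ if }` / `%{ endif }` are Terraform template directives. A minimal sketch of the same escaping behavior (file name and variables illustrative):

    locals {
      rendered = templatefile("${path.module}/demo.hcl.tftpl", {
        job_name = "demo"
      })
    }

    # demo.hcl.tftpl:
    #   job "${job_name}" {                -> job "demo" {
    #     attribute = "$${attr.cpu.arch}"  -> attribute = "${attr.cpu.arch}"
    #   }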
diff --git a/fdio.infra.terraform/1n_nmd/alertmanager/fdio/main.tf b/fdio.infra.terraform/1n_nmd/alertmanager/fdio/main.tf
deleted file mode 100644
index 745e450a8c..0000000000
--- a/fdio.infra.terraform/1n_nmd/alertmanager/fdio/main.tf
+++ /dev/null
@@ -1,14 +0,0 @@
-module "alertmanager" {
-  providers = {
-    nomad = nomad.yul1
-  }
-  source = "../"
-
-  # alertmanager
-  datacenters           = ["yul1"]
-  slack_jenkins_api_key = "TE07RD1V1/B01U1NV9HV3/hKZXJJ74g2JcISq4K3QC1eG9"
-  slack_jenkins_channel = "fdio-jobs-monitoring"
-  slack_default_api_key = "TE07RD1V1/B01UUK23B6C/hZTcCu42FUv8d6rtirHtcYIi"
-  slack_default_channel = "fdio-infra-monitoring"
-  am_version            = "0.23.0"
-}
\ No newline at end of file
diff --git a/fdio.infra.terraform/1n_nmd/alertmanager/fdio/providers.tf b/fdio.infra.terraform/1n_nmd/alertmanager/fdio/providers.tf
deleted file mode 100644
index 42a6a45ce0..0000000000
--- a/fdio.infra.terraform/1n_nmd/alertmanager/fdio/providers.tf
+++ /dev/null
@@ -1,13 +0,0 @@
-provider "nomad" {
-  address = var.nomad_provider_address
-  alias   = "yul1"
-  # ca_file   = var.nomad_provider_ca_file
-  # cert_file = var.nomad_provider_cert_file
-  # key_file  = var.nomad_provider_key_file
-}
-
-provider "vault" {
-  address         = var.vault_provider_address
-  skip_tls_verify = var.vault_provider_skip_tls_verify
-  token           = var.vault_provider_token
-}
\ No newline at end of file
diff --git a/fdio.infra.terraform/1n_nmd/alertmanager/fdio/variables.tf b/fdio.infra.terraform/1n_nmd/alertmanager/fdio/variables.tf
deleted file mode 100644
index 7d5be09d21..0000000000
--- a/fdio.infra.terraform/1n_nmd/alertmanager/fdio/variables.tf
+++ /dev/null
@@ -1,47 +0,0 @@
-variable "nomad_acl" {
-  description = "Nomad ACLs enabled/disabled."
-  type        = bool
-  default     = false
-}
-
-variable "nomad_provider_address" {
-  description = "FD.io Nomad cluster address."
-  type        = string
-  default     = "http://10.32.8.14:4646"
-}
-
-variable "nomad_provider_ca_file" {
-  description = "A local file path to a PEM-encoded certificate authority."
-  type        = string
-  default     = "/etc/nomad.d/ssl/nomad-ca.pem"
-}
-
-variable "nomad_provider_cert_file" {
-  description = "A local file path to a PEM-encoded certificate."
-  type        = string
-  default     = "/etc/nomad.d/ssl/nomad-cli.pem"
-}
-
-variable "nomad_provider_key_file" {
-  description = "A local file path to a PEM-encoded private key."
-  type        = string
-  default     = "/etc/nomad.d/ssl/nomad-cli-key.pem"
-}
-
-variable "vault_provider_address" {
-  description = "Vault cluster address."
-  type        = string
-  default     = "http://10.30.51.28:8200"
-}
-
-variable "vault_provider_skip_tls_verify" {
-  description = "Verification of the Vault server's TLS certificate."
-  type        = bool
-  default     = false
-}
-
-variable "vault_provider_token" {
-  description = "Vault root token."
-  type        = string
-  sensitive   = true
-}
\ No newline at end of file
diff --git a/fdio.infra.terraform/1n_nmd/alertmanager/fdio/versions.tf b/fdio.infra.terraform/1n_nmd/alertmanager/fdio/versions.tf
deleted file mode 100644
index 385c5c3f18..0000000000
--- a/fdio.infra.terraform/1n_nmd/alertmanager/fdio/versions.tf
+++ /dev/null
@@ -1,17 +0,0 @@
-terraform {
-  backend "consul" {
-    address = "10.32.8.14:8500"
-    scheme  = "http"
-    path    = "terraform/alertmanager"
-  }
-  required_providers {
-    nomad = {
-      source  = "hashicorp/nomad"
-      version = ">= 1.4.16"
-    }
-    vault = {
-      version = ">= 3.2.1"
-    }
-  }
-  required_version = ">= 1.1.4"
-}
\ No newline at end of file
diff --git a/fdio.infra.terraform/1n_nmd/alertmanager/main.tf b/fdio.infra.terraform/1n_nmd/alertmanager/main.tf
deleted file mode 100644
index e8a1389150..0000000000
--- a/fdio.infra.terraform/1n_nmd/alertmanager/main.tf
+++ /dev/null
@@ -1,48 +0,0 @@
-locals {
-  datacenters = join(",", var.datacenters)
-  url = join("",
-    [
-      "https://github.com",
-      "/prometheus/alertmanager/releases/download/",
-      "v${var.am_version}/",
-      "alertmanager-${var.am_version}.linux-amd64.tar.gz"
-    ]
-  )
-}
-
-resource "nomad_job" "nomad_job_alertmanager" {
-  jobspec = templatefile(
-    "${path.module}/conf/nomad/alertmanager.hcl.tftpl",
-    {
-      auto_promote              = var.auto_promote,
-      auto_revert               = var.auto_revert,
-      canary                    = var.canary,
-      cpu                       = var.cpu,
-      datacenters               = local.datacenters,
-      group_count               = var.group_count,
-      job_name                  = var.job_name,
-      max_parallel              = var.max_parallel,
-      memory                    = var.memory
-      port                      = var.port,
-      region                    = var.region,
-      service_name              = var.service_name,
-      slack_jenkins_api_key     = var.slack_jenkins_api_key,
-      slack_jenkins_channel     = var.slack_jenkins_channel,
-      slack_jenkins_receiver    = var.slack_jenkins_receiver,
-      slack_default_api_key     = var.slack_default_api_key,
-      slack_default_channel     = var.slack_default_channel,
-      slack_default_receiver    = var.slack_default_receiver,
-      url                       = local.url,
-      use_canary                = var.use_canary,
-      use_host_volume           = var.use_host_volume,
-      use_vault_provider        = var.vault_secret.use_vault_provider,
-      vault_kv_policy_name      = var.vault_secret.vault_kv_policy_name,
-      vault_kv_path             = var.vault_secret.vault_kv_path,
-      vault_kv_field_access_key = var.vault_secret.vault_kv_field_access_key,
-      vault_kv_field_secret_key = var.vault_secret.vault_kv_field_secret_key,
-      version                   = var.am_version,
-      volume_destination        = var.volume_destination,
-      volume_source             = var.volume_source
-    })
-  detach = false
-}
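Because the module derives the release URL from am_version and feeds canary/auto_promote into the job's update stanza, bumping the version in a thin wrapper like fdio/main.tf above is the entire upgrade procedure; Nomad then rolls the new binary out through canaries. A hedged sketch of such a bump (version value illustrative):

    module "alertmanager" {
      source = "../terraform-nomad-alertmanager"

      datacenters = ["yul1"]
      # Drives both the artifact download URL and the binary path
      # local/alertmanager-<version>.linux-amd64/alertmanager.
      am_version  = "0.24.0"
    }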
last stable job" - type = bool - default = true -} - -variable "canary" { - description = "Equal to the count of the task group allows blue/green depl." - type = number - default = 1 -} - -variable "cpu" { - description = "CPU allocation" - type = number - default = 1000 -} - -variable "group_count" { - description = "Specifies the number of the task groups running under this one" - type = number - default = 1 -} - -variable "job_name" { - description = "Specifies a name for the job" - type = string - default = "alertmanager" -} - -variable "max_parallel" { - description = "Specifies the maximum number of updates to perform in parallel" - type = number - default = 1 -} - -variable "memory" { - description = "Specifies the memory required in MB" - type = number - default = 1024 -} - -variable "port" { - description = "Specifies the static TCP/UDP port to allocate" - type = number - default = 9093 -} - -variable "service_name" { - description = "Specifies the name this service will be advertised in Consul" - type = string - default = "alertmanager" -} - -variable "use_canary" { - description = "Uses canary deployment" - type = bool - default = true -} - -variable "use_host_volume" { - description = "Use Nomad host volume feature" - type = bool - default = false -} - -variable "vault_secret" { - type = object({ - use_vault_provider = bool, - vault_kv_policy_name = string, - vault_kv_path = string, - vault_kv_field_access_key = string, - vault_kv_field_secret_key = string - }) - description = "Set of properties to be able to fetch secret from vault." - default = { - use_vault_provider = false - vault_kv_policy_name = "kv" - vault_kv_path = "secret/data/alertmanager" - vault_kv_field_access_key = "access_key" - vault_kv_field_secret_key = "secret_key" - } -} - -variable "volume_destination" { - description = "Specifies where the volume should be mounted inside the task" - type = string - default = "/data/" -} - -variable "slack_jenkins_api_key" { - description = "Alertmanager jenkins slack API key" - type = string - default = "XXXXXXXXX/XXXXXXXXXXX/XXXXXXXXXXXXXXXXXXXXXXXX" -} - -variable "slack_jenkins_receiver" { - description = "Alertmanager jenkins slack receiver" - type = string - default = "jenkins-slack-receiver" -} - -variable "slack_jenkins_channel" { - description = "Alertmanager jenkins slack channel" - type = string - default = "jenkins-channel" -} - -variable "slack_default_api_key" { - description = "Alertmanager default slack API key" - type = string - default = "XXXXXXXXX/XXXXXXXXXXX/XXXXXXXXXXXXXXXXXXXXXXXX" -} - -variable "slack_default_receiver" { - description = "Alertmanager default slack receiver" - type = string - default = "default-slack-receiver" -} - -variable "slack_default_channel" { - description = "Alertmanager default slack channel" - type = string - default = "default-channel" -} \ No newline at end of file diff --git a/fdio.infra.terraform/1n_nmd/alertmanager/versions.tf b/fdio.infra.terraform/1n_nmd/alertmanager/versions.tf deleted file mode 100644 index 5f283ed4ea..0000000000 --- a/fdio.infra.terraform/1n_nmd/alertmanager/versions.tf +++ /dev/null @@ -1,9 +0,0 @@ -terraform { - required_providers { - nomad = { - source = "hashicorp/nomad" - version = ">= 1.4.16" - } - } - required_version = ">= 1.1.4" -} \ No newline at end of file diff --git a/fdio.infra.terraform/1n_nmd/etl/conf/nomad/etl.hcl.tftpl b/fdio.infra.terraform/1n_nmd/etl/conf/nomad/etl.hcl.tftpl deleted file mode 100644 index 208fb0a59f..0000000000 --- 
diff --git a/fdio.infra.terraform/1n_nmd/etl/conf/nomad/etl.hcl.tftpl b/fdio.infra.terraform/1n_nmd/etl/conf/nomad/etl.hcl.tftpl
deleted file mode 100644
index 208fb0a59f..0000000000
--- a/fdio.infra.terraform/1n_nmd/etl/conf/nomad/etl.hcl.tftpl
+++ /dev/null
@@ -1,318 +0,0 @@
-job "${job_name}" {
-  # The "datacenters" parameter specifies the list of datacenters which should
-  # be considered when placing this task. This must be provided.
-  datacenters = "${datacenters}"
-
-  # The "type" parameter controls the type of job, which impacts the scheduler's
-  # decision on placement. For a full list of job types and their differences,
-  # please see the online documentation.
-  #
-  # https://www.nomadproject.io/docs/jobspec/schedulers
-  #
-  type = "${type}"
-
-  # The periodic stanza allows a job to run at fixed times, dates, or intervals.
-  # The easiest way to think about the periodic scheduler is "Nomad cron" or
-  # "distributed cron".
-  #
-  # https://www.nomadproject.io/docs/job-specification/periodic
-  #
-  periodic {
-    cron             = "${cron}"
-    prohibit_overlap = "${prohibit_overlap}"
-    time_zone        = "${time_zone}"
-  }
-
-  # The "group" stanza defines a series of tasks that should be co-located on
-  # the same Nomad client. Any task within a group will be placed on the same
-  # client.
-  #
-  # https://www.nomadproject.io/docs/job-specification/group
-  #
-  group "${job_name}-master" {
-    # The restart stanza configures a task's behavior on task failure. Restarts
-    # happen on the client that is running the task.
-    #
-    # https://www.nomadproject.io/docs/job-specification/restart
-    #
-    restart {
-      mode = "fail"
-    }
-
-    # The constraint allows restricting the set of eligible nodes. Constraints
-    # may filter on attributes or client metadata.
-    #
-    # For more information and examples on the "constraint" stanza, please see
-    # the online documentation at:
-    #
-    # https://www.nomadproject.io/docs/job-specification/constraint
-    #
-    constraint {
-      attribute = "$${attr.cpu.arch}"
-      operator  = "!="
-      value     = "arm64"
-    }
-
-    constraint {
-      attribute = "$${node.class}"
-      value     = "builder"
-    }
-
-    # The "task" stanza creates an individual unit of work, such as a Docker
-    # container, web application, or batch processing.
-    #
-    # https://www.nomadproject.io/docs/job-specification/task.html
-    #
-    task "${job_name}-trending" {
-      # The artifact stanza instructs Nomad to fetch and unpack a remote
-      # resource, such as a file, tarball, or binary.
-      #
-      # https://www.nomadproject.io/docs/job-specification/artifact
-      #
-      artifact {
-        source      = "git::https://github.com/FDio/csit"
-        destination = "local/csit"
-      }
-
-      # The "driver" parameter specifies the task driver that should be used to
-      # run the task.
-      driver = "docker"
-
-      # The "config" stanza specifies the driver configuration, which is passed
-      # directly to the driver to start the task. The details of configurations
-      # are specific to each driver, so please see specific driver
-      # documentation for more information.
-      config {
-        image    = "${image}"
-        command  = "gluesparksubmit"
-        args     = [
-          "--driver-memory", "20g",
-          "--executor-memory", "20g",
-          "trending.py"
-        ]
-        work_dir = "/local/csit/csit.infra.etl"
-      }
-
-      # The env stanza configures a list of environment variables to populate
-      # the task's environment before starting.
-      env {
-        AWS_ACCESS_KEY_ID         = "${aws_access_key_id}"
-        AWS_SECRET_ACCESS_KEY     = "${aws_secret_access_key}"
-        AWS_DEFAULT_REGION        = "${aws_default_region}"
-        OUT_AWS_ACCESS_KEY_ID     = "${out_aws_access_key_id}"
-        OUT_AWS_SECRET_ACCESS_KEY = "${out_aws_secret_access_key}"
-        OUT_AWS_DEFAULT_REGION    = "${out_aws_default_region}"
-        ${ envs }
-      }
-
-      # The "resources" stanza describes the requirements a task needs to
-      # execute. Resource requirements include memory, network, cpu, and more.
-      # This ensures the task will execute on a machine that contains enough
-      # resource capacity.
-      #
-      # https://www.nomadproject.io/docs/job-specification/resources
-      #
-      resources {
-        cpu    = ${cpu}
-        memory = ${memory}
-      }
-    }
-    task "${job_name}-stats" {
-      # The artifact stanza instructs Nomad to fetch and unpack a remote
-      # resource, such as a file, tarball, or binary.
-      #
-      # https://www.nomadproject.io/docs/job-specification/artifact
-      #
-      artifact {
-        source      = "git::https://github.com/FDio/csit"
-        destination = "local/csit"
-      }
-
-      # The "driver" parameter specifies the task driver that should be used to
-      # run the task.
-      driver = "docker"
-
-      # The "config" stanza specifies the driver configuration, which is passed
-      # directly to the driver to start the task. The details of configurations
-      # are specific to each driver, so please see specific driver
-      # documentation for more information.
-      config {
-        image    = "${image}"
-        command  = "gluesparksubmit"
-        args     = [
-          "--driver-memory", "10g",
-          "--executor-memory", "10g",
-          "stats.py"
-        ]
-        work_dir = "/local/csit/csit.infra.etl"
-      }
-
-      # The env stanza configures a list of environment variables to populate
-      # the task's environment before starting.
-      env {
-        AWS_ACCESS_KEY_ID         = "${aws_access_key_id}"
-        AWS_SECRET_ACCESS_KEY     = "${aws_secret_access_key}"
-        AWS_DEFAULT_REGION        = "${aws_default_region}"
-        OUT_AWS_ACCESS_KEY_ID     = "${out_aws_access_key_id}"
-        OUT_AWS_SECRET_ACCESS_KEY = "${out_aws_secret_access_key}"
-        OUT_AWS_DEFAULT_REGION    = "${out_aws_default_region}"
-        ${ envs }
-      }
-
-      # The "resources" stanza describes the requirements a task needs to
-      # execute. Resource requirements include memory, network, cpu, and more.
-      # This ensures the task will execute on a machine that contains enough
-      # resource capacity.
-      #
-      # https://www.nomadproject.io/docs/job-specification/resources
-      #
-      resources {
-        cpu    = ${cpu}
-        memory = ${memory}
-      }
-    }
-  }
-  group "${job_name}-rls2202" {
-    # The restart stanza configures a task's behavior on task failure. Restarts
-    # happen on the client that is running the task.
-    #
-    # https://www.nomadproject.io/docs/job-specification/restart
-    #
-    restart {
-      mode = "fail"
-    }
-
-    # The constraint allows restricting the set of eligible nodes. Constraints
-    # may filter on attributes or client metadata.
-    #
-    # For more information and examples on the "constraint" stanza, please see
-    # the online documentation at:
-    #
-    # https://www.nomadproject.io/docs/job-specification/constraint
-    #
-    constraint {
-      attribute = "$${attr.cpu.arch}"
-      operator  = "!="
-      value     = "arm64"
-    }
-
-    constraint {
-      attribute = "$${node.class}"
-      value     = "builder"
-    }
-
-    # The "task" stanza creates an individual unit of work, such as a Docker
-    # container, web application, or batch processing.
-    #
-    # https://www.nomadproject.io/docs/job-specification/task.html
-    #
-    task "${job_name}-coverage" {
-      # The artifact stanza instructs Nomad to fetch and unpack a remote
-      # resource, such as a file, tarball, or binary.
-      #
-      # https://www.nomadproject.io/docs/job-specification/artifact
-      #
-      artifact {
-        source      = "git::https://github.com/FDio/csit"
-        destination = "local/csit"
-      }
-
-      # The "driver" parameter specifies the task driver that should be used to
-      # run the task.
-      driver = "docker"
-
-      # The "config" stanza specifies the driver configuration, which is passed
-      # directly to the driver to start the task. The details of configurations
-      # are specific to each driver, so please see specific driver
-      # documentation for more information.
-      config {
-        image    = "${image}"
-        command  = "gluesparksubmit"
-        args     = [
-          "--driver-memory", "20g",
-          "--executor-memory", "20g",
-          "coverage_rls2202.py"
-        ]
-        work_dir = "/local/csit/csit.infra.etl"
-      }
-
-      # The env stanza configures a list of environment variables to populate
-      # the task's environment before starting.
-      env {
-        AWS_ACCESS_KEY_ID         = "${aws_access_key_id}"
-        AWS_SECRET_ACCESS_KEY     = "${aws_secret_access_key}"
-        AWS_DEFAULT_REGION        = "${aws_default_region}"
-        OUT_AWS_ACCESS_KEY_ID     = "${out_aws_access_key_id}"
-        OUT_AWS_SECRET_ACCESS_KEY = "${out_aws_secret_access_key}"
-        OUT_AWS_DEFAULT_REGION    = "${out_aws_default_region}"
-        ${ envs }
-      }
-
-      # The "resources" stanza describes the requirements a task needs to
-      # execute. Resource requirements include memory, network, cpu, and more.
-      # This ensures the task will execute on a machine that contains enough
-      # resource capacity.
-      #
-      # https://www.nomadproject.io/docs/job-specification/resources
-      #
-      resources {
-        cpu    = ${cpu}
-        memory = ${memory}
-      }
-    }
-    task "${job_name}-iterative" {
-      # The artifact stanza instructs Nomad to fetch and unpack a remote
-      # resource, such as a file, tarball, or binary.
-      #
-      # https://www.nomadproject.io/docs/job-specification/artifact
-      #
-      artifact {
-        source      = "git::https://github.com/FDio/csit"
-        destination = "local/csit"
-      }
-
-      # The "driver" parameter specifies the task driver that should be used to
-      # run the task.
-      driver = "docker"
-
-      # The "config" stanza specifies the driver configuration, which is passed
-      # directly to the driver to start the task. The details of configurations
-      # are specific to each driver, so please see specific driver
-      # documentation for more information.
-      config {
-        image    = "${image}"
-        command  = "gluesparksubmit"
-        args     = [
-          "--driver-memory", "20g",
-          "--executor-memory", "20g",
-          "iterative_rls2202.py"
-        ]
-        work_dir = "/local/csit/csit.infra.etl"
-      }
-
-      # The env stanza configures a list of environment variables to populate
-      # the task's environment before starting.
-      env {
-        AWS_ACCESS_KEY_ID         = "${aws_access_key_id}"
-        AWS_SECRET_ACCESS_KEY     = "${aws_secret_access_key}"
-        AWS_DEFAULT_REGION        = "${aws_default_region}"
-        OUT_AWS_ACCESS_KEY_ID     = "${out_aws_access_key_id}"
-        OUT_AWS_SECRET_ACCESS_KEY = "${out_aws_secret_access_key}"
-        OUT_AWS_DEFAULT_REGION    = "${out_aws_default_region}"
-        ${ envs }
-      }
-
-      # The "resources" stanza describes the requirements a task needs to
-      # execute. Resource requirements include memory, network, cpu, and more.
-      # This ensures the task will execute on a machine that contains enough
-      # resource capacity.
-      #
-      # https://www.nomadproject.io/docs/job-specification/resources
-      #
-      resources {
-        cpu    = ${cpu}
-        memory = ${memory}
-      }
-    }
-  }
-}
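The `${ envs }` placeholder in each env stanza above is filled from the module's `envs` list variable, which the module's main.tf (further below) joins with newlines before rendering, so callers can append extra environment variables without editing the template. A sketch of such a caller (module path and variable values illustrative; each entry is spliced verbatim into the rendered env stanza, so it must be valid jobspec syntax):

    module "etl" {
      source = "../terraform-nomad-pyspark-etl"

      datacenters = ["yul1"]
      envs = [
        "AWS_MAX_ATTEMPTS = \"5\"",
        "AWS_RETRY_MODE = \"standard\""
      ]
    }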
- type = string - sensitive = true -} diff --git a/fdio.infra.terraform/1n_nmd/etl/fdio/versions.tf b/fdio.infra.terraform/1n_nmd/etl/fdio/versions.tf deleted file mode 100644 index 526e1d0df0..0000000000 --- a/fdio.infra.terraform/1n_nmd/etl/fdio/versions.tf +++ /dev/null @@ -1,17 +0,0 @@ -terraform { - backend "consul" { - address = "10.32.8.14:8500" - scheme = "http" - path = "terraform/etl" - } - required_providers { - nomad = { - source = "hashicorp/nomad" - version = ">= 1.4.16" - } - vault = { - version = ">= 3.2.1" - } - } - required_version = ">= 1.1.4" -} diff --git a/fdio.infra.terraform/1n_nmd/etl/main.tf b/fdio.infra.terraform/1n_nmd/etl/main.tf deleted file mode 100644 index c477da81a8..0000000000 --- a/fdio.infra.terraform/1n_nmd/etl/main.tf +++ /dev/null @@ -1,33 +0,0 @@ -locals { - datacenters = join(",", var.datacenters) - envs = join("\n", concat([], var.envs)) -} - -resource "nomad_job" "nomad_job_etl" { - jobspec = templatefile( - "${path.module}/conf/nomad/etl.hcl.tftpl", - { - aws_access_key_id = var.aws_access_key_id, - aws_secret_access_key = var.aws_secret_access_key, - aws_default_region = var.aws_default_region - cpu = var.cpu, - cron = var.cron, - datacenters = local.datacenters, - envs = local.envs, - image = var.image, - job_name = var.job_name, - memory = var.memory, - out_aws_access_key_id = var.out_aws_access_key_id, - out_aws_secret_access_key = var.out_aws_secret_access_key, - out_aws_default_region = var.out_aws_default_region - prohibit_overlap = var.prohibit_overlap, - time_zone = var.time_zone, - type = var.type, - use_vault_provider = var.vault_secret.use_vault_provider, - vault_kv_policy_name = var.vault_secret.vault_kv_policy_name, - vault_kv_path = var.vault_secret.vault_kv_path, - vault_kv_field_access_key = var.vault_secret.vault_kv_field_access_key, - vault_kv_field_secret_key = var.vault_secret.vault_kv_field_secret_key - }) - detach = false -} diff --git a/fdio.infra.terraform/1n_nmd/etl/variables.tf b/fdio.infra.terraform/1n_nmd/etl/variables.tf deleted file mode 100644 index 3c6c12a943..0000000000 --- a/fdio.infra.terraform/1n_nmd/etl/variables.tf +++ /dev/null @@ -1,115 +0,0 @@ -# Nomad -variable "datacenters" { - description = "Specifies the list of DCs to be considered placing this task." - type = list(string) - default = ["dc1"] -} - -# ETL -variable "aws_access_key_id" { - description = "AWS access key." - type = string - default = "aws" -} - -variable "aws_secret_access_key" { - description = "AWS secret key" - type = string - default = "aws" -} - -variable "aws_default_region" { - description = "AWS region" - type = string - default = "aws" -} - -variable "cpu" { - description = "Specifies the CPU required to run this task in MHz." - type = number - default = 10000 -} - -variable "cron" { - description = "Specifies a cron expression configuring the interval to launch." - type = string - default = "@daily" -} - -variable "envs" { - description = "Specifies ETL environment variables." - type = list(string) - default = [] -} - -variable "image" { - description = "Specifies the Docker image to run." - type = string - default = "pmikus/docker-ubuntu-focal-aws-glue:latest" -} - -variable "job_name" { - description = "Specifies a name for the job." - type = string - default = "etl" -} - -variable "memory" { - description = "Specifies the memory required in MB." - type = number - default = 20000 -} - -variable "out_aws_access_key_id" { - description = "AWS access key." 
- type = string - default = "aws" -} - -variable "out_aws_secret_access_key" { - description = "AWS secret key" - type = string - default = "aws" -} - -variable "out_aws_default_region" { - description = "AWS region" - type = string - default = "aws" -} - -variable "prohibit_overlap" { - description = "Specifies if this job should wait until previous completed." - type = bool - default = true -} - -variable "time_zone" { - description = "Specifies the time zone to evaluate the next launch interval." - type = string - default = "UTC" -} - -variable "type" { - description = "Specifies the Nomad scheduler to use." - type = string - default = "batch" -} - -variable "vault_secret" { - type = object({ - use_vault_provider = bool, - vault_kv_policy_name = string, - vault_kv_path = string, - vault_kv_field_access_key = string, - vault_kv_field_secret_key = string - }) - description = "Set of properties to be able to fetch secret from vault." - default = { - use_vault_provider = false - vault_kv_policy_name = "kv" - vault_kv_path = "secret/data/etl" - vault_kv_field_access_key = "access_key" - vault_kv_field_secret_key = "secret_key" - } -} diff --git a/fdio.infra.terraform/1n_nmd/etl/versions.tf b/fdio.infra.terraform/1n_nmd/etl/versions.tf deleted file mode 100644 index a01708f28a..0000000000 --- a/fdio.infra.terraform/1n_nmd/etl/versions.tf +++ /dev/null @@ -1,9 +0,0 @@ -terraform { - required_providers { - nomad = { - source = "hashicorp/nomad" - version = ">= 1.4.16" - } - } - required_version = ">= 1.1.4" -} diff --git a/fdio.infra.terraform/1n_nmd/main.tf b/fdio.infra.terraform/1n_nmd/main.tf index 7cdd245b7a..24d5ff3efc 100644 --- a/fdio.infra.terraform/1n_nmd/main.tf +++ b/fdio.infra.terraform/1n_nmd/main.tf @@ -4,31 +4,6 @@ # and downstream modules can simply declare resources for that provider # and have them automatically associated with the root provider # configurations. 
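For orientation while reading the removals below: where a service is carried forward by this refactor, it reappears later in the patch as a standalone terraform-*-* tree whose fdio/ directory is its own root module, pinning providers and backend locally and invoking the shared module one level up. The pattern, roughly (mirroring the new fdio/main.tf files later in this patch):

    module "alertmanager" {
      providers = {
        nomad = nomad.yul1
      }
      source = "../"

      # service-specific inputs
      datacenters = ["yul1"]
    }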
-module "grafana" { - source = "./grafana" - providers = { - nomad = nomad.yul1 - } - - # nomad - nomad_datacenters = ["yul1"] - - # grafana - grafana_job_name = "prod-grafana" - grafana_use_canary = true - grafana_group_count = 1 - grafana_vault_secret = { - use_vault_provider = false, - vault_kv_policy_name = "kv-secret", - vault_kv_path = "secret/data/grafana", - vault_kv_field_access_key = "access_key", - vault_kv_field_secret_key = "secret_key" - } - grafana_container_image = "grafana/grafana:7.3.7" - grafana_cpu = 1000 - grafana_mem = 2048 - grafana_port = 3000 -} #module "minio" { # source = "./minio" @@ -66,41 +41,6 @@ data "vault_generic_secret" "minio_creds" { path = "kv/secret/data/minio" } -module "minio_s3_gateway" { - source = "./minio_s3_gateway" - providers = { - nomad = nomad.yul1 - } - - # nomad - datacenters = ["yul1"] - volume_source = "prod-volume-data1-1" - - # minio - job_name = "minio-s3-gateway" - group_count = 4 - service_name = "minio" - mode = "gateway" - port_base = 9001 - port_console = 9002 - image = "minio/minio:latest" - access_key = data.vault_generic_secret.minio_creds.data["access_key"] - secret_key = data.vault_generic_secret.minio_creds.data["secret_key"] - volume_destination = "/data/" - use_host_volume = true - use_canary = true - envs = [ - "MINIO_BROWSER=\"off\"", - "MINIO_CACHE=\"on\"", - "MINIO_CACHE_DRIVES=\"/data/s3_cache1\"", - "MINIO_CACHE_EXCLUDE=\"\"", - "MINIO_CACHE_QUOTA=80", - "MINIO_CACHE_AFTER=1", - "MINIO_CACHE_WATERMARK_LOW=70", - "MINIO_CACHE_WATERMARK_HIGH=90" - ] -} - #module "nginx" { # source = "./nginx" # providers = { @@ -116,35 +56,6 @@ module "minio_s3_gateway" { # nginx_use_host_volume = true #} -module "prometheus" { - source = "./prometheus" - providers = { - nomad = nomad.yul1 - } - - # nomad - nomad_datacenters = ["yul1"] - nomad_host_volume = "prod-volume-data1-1" - - # prometheus - prometheus_job_name = "prod-prometheus" - prometheus_use_canary = true - prometheus_group_count = 4 - prometheus_vault_secret = { - use_vault_provider = false, - vault_kv_policy_name = "kv-secret", - vault_kv_path = "secret/data/prometheus", - vault_kv_field_access_key = "access_key", - vault_kv_field_secret_key = "secret_key" - } - prometheus_data_dir = "/data/" - prometheus_use_host_volume = true - prometheus_version = "2.28.1" - prometheus_cpu = 2000 - prometheus_mem = 8192 - prometheus_port = 9090 -} - module "vpp_device" { source = "./vpp_device" providers = { diff --git a/fdio.infra.terraform/1n_nmd/minio_s3_gateway/conf/nomad/minio.hcl b/fdio.infra.terraform/1n_nmd/minio_s3_gateway/conf/nomad/minio.hcl deleted file mode 100644 index 6210040b0c..0000000000 --- a/fdio.infra.terraform/1n_nmd/minio_s3_gateway/conf/nomad/minio.hcl +++ /dev/null @@ -1,246 +0,0 @@ -job "${job_name}" { - # The "region" parameter specifies the region in which to execute the job. - # If omitted, this inherits the default region name of "global". - # region = "${region}" - - # The "datacenters" parameter specifies the list of datacenters which should - # be considered when placing this task. This must be provided. - datacenters = "${datacenters}" - - # The "type" parameter controls the type of job, which impacts the scheduler's - # decision on placement. This configuration is optional and defaults to - # "service". For a full list of job types and their differences, please see - # the online documentation. 
- # - # https://www.nomadproject.io/docs/jobspec/schedulers - # - type = "service" - - update { - # The "max_parallel" parameter specifies the maximum number of updates to - # perform in parallel. In this case, this specifies to update a single task - # at a time. - max_parallel = ${max_parallel} - - health_check = "checks" - - # The "min_healthy_time" parameter specifies the minimum time the allocation - # must be in the healthy state before it is marked as healthy and unblocks - # further allocations from being updated. - min_healthy_time = "10s" - - # The "healthy_deadline" parameter specifies the deadline in which the - # allocation must be marked as healthy after which the allocation is - # automatically transitioned to unhealthy. Transitioning to unhealthy will - # fail the deployment and potentially roll back the job if "auto_revert" is - # set to true. - healthy_deadline = "3m" - - # The "progress_deadline" parameter specifies the deadline in which an - # allocation must be marked as healthy. The deadline begins when the first - # allocation for the deployment is created and is reset whenever an allocation - # as part of the deployment transitions to a healthy state. If no allocation - # transitions to the healthy state before the progress deadline, the - # deployment is marked as failed. - progress_deadline = "10m" - -%{ if use_canary } - # The "canary" parameter specifies that changes to the job that would result - # in destructive updates should create the specified number of canaries - # without stopping any previous allocations. Once the operator determines the - # canaries are healthy, they can be promoted which unblocks a rolling update - # of the remaining allocations at a rate of "max_parallel". - # - # Further, setting "canary" equal to the count of the task group allows - # blue/green deployments. When the job is updated, a full set of the new - # version is deployed and upon promotion the old version is stopped. - canary = ${canary} - - # Specifies if the job should auto-promote to the canary version when all - # canaries become healthy during a deployment. Defaults to false which means - # canaries must be manually updated with the nomad deployment promote - # command. - auto_promote = ${auto_promote} - - # The "auto_revert" parameter specifies if the job should auto-revert to the - # last stable job on deployment failure. A job is marked as stable if all the - # allocations as part of its deployment were marked healthy. - auto_revert = ${auto_revert} -%{ endif } - } - - # All groups in this job should be scheduled on different hosts. - constraint { - operator = "distinct_hosts" - value = "true" - } - - # The "group" stanza defines a series of tasks that should be co-located on - # the same Nomad client. Any task within a group will be placed on the same - # client. - # - # https://www.nomadproject.io/docs/job-specification/group - # - group "${job_name}-group-1" { - # The "count" parameter specifies the number of the task groups that should - # be running under this group. This value must be non-negative and defaults - # to 1. - count = ${group_count} - - # The volume stanza allows the group to specify that it requires a given - # volume from the cluster. The key of the stanza is the name of the volume - # as it will be exposed to task configuration. 
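As the canary comments above spell out, setting canary equal to the task group count is what turns a rolling update into a blue/green deployment: a full second set of allocations is created, and the old set is stopped on promotion. For a group with count = 4, a blue/green variant of this stanza would render roughly as (values illustrative, not this module's defaults):

    update {
      max_parallel = 1
      canary       = 4     # one canary per existing allocation: a full new set
      auto_promote = false # promote by hand: nomad deployment promote <deployment-id>
      auto_revert  = true
    }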
- # - # https://www.nomadproject.io/docs/job-specification/volume - %{ if use_host_volume } - volume "${job_name}-volume-1" { - type = "host" - read_only = false - source = "${volume_source}" - } - %{ endif } - - # The restart stanza configures a tasks's behavior on task failure. Restarts - # happen on the client that is running the task. - # - # https://www.nomadproject.io/docs/job-specification/restart - # - restart { - interval = "30m" - attempts = 40 - delay = "15s" - mode = "delay" - } - - # The network stanza specifies the networking requirements for the task - # group, including the network mode and port allocations. When scheduling - # jobs in Nomad they are provisioned across your fleet of machines along - # with other jobs and services. Because you don't know in advance what host - # your job will be provisioned on, Nomad will provide your tasks with - # network configuration when they start up. - # - # https://www.nomadproject.io/docs/job-specification/network - # - network { - port "base" { - static = ${port_base} - to = ${port_base} - } - port "console" { - static = ${port_console} - to = ${port_console} - } - } - - # The "task" stanza creates an individual unit of work, such as a Docker - # container, web application, or batch processing. - # - # https://www.nomadproject.io/docs/job-specification/task.html - # - task "${job_name}-task-1" { - # The "driver" parameter specifies the task driver that should be used to - # run the task. - driver = "exec" - - %{ if use_host_volume } - volume_mount { - volume = "${job_name}-volume-1" - destination = "${volume_destination}" - read_only = false - } - %{ endif } - - %{ if use_vault_provider } - vault { - policies = "${vault_kv_policy_name}" - } - %{ endif } - - # The "config" stanza specifies the driver configuration, which is passed - # directly to the driver to start the task. The details of configurations - # are specific to each driver, so please see specific driver - # documentation for more information. - config { - args = [ - "${mode}", "s3", - "-address", ":${port_base}", - "-console-address", ":${port_console}" - ] - command = "local/minio" - } - - # The artifact stanza instructs Nomad to fetch and unpack a remote resource, - # such as a file, tarball, or binary. Nomad downloads artifacts using the - # popular go-getter library, which permits downloading artifacts from a - # variety of locations using a URL as the input source. - # - # For more information and examples on the "artifact" stanza, please see - # the online documentation at: - # - # https://www.nomadproject.io/docs/job-specification/artifact - # - artifact { - source = "https://dl.min.io/server/minio/release/linux-amd64/minio" - } - - # The env stanza configures a list of environment variables to populate - # the task's environment before starting. - env { -%{ if use_vault_provider } -{{ with secret "${vault_kv_path}" }} - MINIO_ROOT_USER = "{{ .Data.data.${vault_kv_field_access_key} }}" - MINIO_ROOT_PASSWORD = "{{ .Data.data.${vault_kv_field_secret_key} }}" -{{ end }} -%{ else } - MINIO_ROOT_USER = "${access_key}" - MINIO_ROOT_PASSWORD = "${secret_key}" - AWS_ACCESS_KEY_ID = "${access_key}" - AWS_SECRET_ACCESS_KEY = "${secret_key}" -%{ endif } - ${ envs } - } - - # The service stanza instructs Nomad to register a service with Consul. 
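Two template layers meet in the env block above: the %{ if }/%{ endif } directives are resolved by Terraform at render time, while the {{ with secret }} lines belong to the Vault branch and are left in the output for Nomad-side templating. With use_vault_provider = false the Vault branch is stripped entirely, and under the module defaults (access_key and secret_key both default to "minio") the stanza reduces to static credentials:

    env {
      MINIO_ROOT_USER       = "minio"
      MINIO_ROOT_PASSWORD   = "minio"
      AWS_ACCESS_KEY_ID     = "minio"
      AWS_SECRET_ACCESS_KEY = "minio"
    }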
- # - # https://www.nomadproject.io/docs/job-specification/service - # - service { - name = "${service_name}" - port = "base" - tags = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ] - check { - name = "Min.io Server HTTP Check Live" - type = "http" - port = "base" - protocol = "http" - method = "GET" - path = "/minio/health/live" - interval = "10s" - timeout = "2s" - } - check { - name = "Min.io Server HTTP Check Ready" - type = "http" - port = "base" - protocol = "http" - method = "GET" - path = "/minio/health/ready" - interval = "10s" - timeout = "2s" - } - } - - # The "resources" stanza describes the requirements a task needs to - # execute. Resource requirements include memory, network, cpu, and more. - # This ensures the task will execute on a machine that contains enough - # resource capacity. - # - # https://www.nomadproject.io/docs/job-specification/resources - # - resources { - cpu = ${cpu} - memory = ${memory} - } - } - } -} diff --git a/fdio.infra.terraform/1n_nmd/minio_s3_gateway/main.tf b/fdio.infra.terraform/1n_nmd/minio_s3_gateway/main.tf deleted file mode 100644 index 2ae3cac9c2..0000000000 --- a/fdio.infra.terraform/1n_nmd/minio_s3_gateway/main.tf +++ /dev/null @@ -1,51 +0,0 @@ -locals { - datacenters = join(",", var.datacenters) - envs = join("\n", concat([], var.envs)) - upstreams = jsonencode(var.upstreams) -} - -data "template_file" "nomad_job_minio" { - template = file("${path.module}/conf/nomad/minio.hcl") - vars = { - access_key = var.access_key - auto_promote = var.auto_promote - auto_revert = var.auto_revert - canary = var.canary - cpu = var.cpu - cpu_proxy = var.resource_proxy.cpu - datacenters = local.datacenters - envs = local.envs - group_count = var.group_count - host = var.host - image = var.image - job_name = var.job_name - max_parallel = var.max_parallel - memory = var.memory - memory_proxy = var.resource_proxy.memory - mode = var.mode - port_base = var.port_base - port_console = var.port_console - region = var.region - secret_key = var.secret_key - service_name = var.service_name - use_canary = var.use_canary - use_host_volume = var.use_host_volume - upstreams = local.upstreams - use_vault_kms = var.kms_variables.use_vault_kms - use_vault_provider = var.vault_secret.use_vault_provider - vault_address = var.kms_variables.vault_address - vault_kms_approle_kv = var.kms_variables.vault_kms_approle_kv - vault_kms_key_name = var.kms_variables.vault_kms_key_name - vault_kv_policy_name = var.vault_secret.vault_kv_policy_name - vault_kv_path = var.vault_secret.vault_kv_path - vault_kv_field_access_key = var.vault_secret.vault_kv_field_access_key - vault_kv_field_secret_key = var.vault_secret.vault_kv_field_secret_key - volume_destination = var.volume_destination - volume_source = var.volume_source - } -} - -resource "nomad_job" "nomad_job_minio" { - jobspec = data.template_file.nomad_job_minio.rendered - detach = false -} diff --git a/fdio.infra.terraform/1n_nmd/minio_s3_gateway/variables.tf b/fdio.infra.terraform/1n_nmd/minio_s3_gateway/variables.tf deleted file mode 100644 index 6fb351df26..0000000000 --- a/fdio.infra.terraform/1n_nmd/minio_s3_gateway/variables.tf +++ /dev/null @@ -1,199 +0,0 @@ -# Nomad - -variable "datacenters" { - description = "Specifies the list of DCs to be considered placing this task" - type = list(string) - default = ["dc1"] -} - -variable "region" { - description = "Specifies the list of DCs to be considered placing this task" - type = string - default = "global" -} - -variable "volume_source" { - description = "The name of the volume to 
request" - type = string - default = "persistence" -} - -# Minio -variable "access_key" { - description = "Minio access key" - type = string - default = "minio" -} - -variable "auto_promote" { - description = "Specifies if the job should auto-promote to the canary version" - type = bool - default = true -} - -variable "auto_revert" { - description = "Specifies if the job should auto-revert to the last stable job" - type = bool - default = true -} - -variable "canary" { - description = "Equal to the count of the task group allows blue/green depl." - type = number - default = 1 -} - -variable "cpu" { - description = "Specifies the CPU required to run this task in MHz" - type = number - default = 1000 -} - -variable "envs" { - description = "Minio environment variables" - type = list(string) - default = [] -} - -variable "group_count" { - description = "Specifies the number of the task groups running under this one" - type = number - default = 1 -} - -variable "host" { - description = "Minio host" - type = string - default = "127.0.0.1" -} - -variable "image" { - description = "The Docker image to run" - type = string - default = "minio/minio:latest" -} - -variable "job_name" { - description = "Specifies a name for the job" - type = string - default = "minio" -} - -variable "kms_variables" { - type = object({ - use_vault_kms = string - vault_address = string, - vault_kms_approle_kv = string, - vault_kms_key_name = string - }) - description = "Set of properties to be able to transit secrets in vault" - default = { - use_vault_kms = false - vault_address = "", - vault_kms_approle_kv = "", - vault_kms_key_name = "" - } -} - -variable "max_parallel" { - description = "Specifies the maximum number of updates to perform in parallel" - type = number - default = 1 -} - -variable "memory" { - description = "Specifies the memory required in MB" - type = number - default = 1024 -} - -variable "mode" { - description = "Specifies the Minio mode" - type = string - default = "server" -} - -variable "port_base" { - description = "Specifies the static TCP/UDP port to allocate" - type = number - default = 9000 -} - -variable "port_console" { - description = "Specifies the static TCP/UDP port to allocate" - type = number - default = 9001 -} - -variable "resource_proxy" { - description = "Minio proxy resources" - type = object({ - cpu = number, - memory = number - }) - default = { - cpu = 2000, - memory = 1024 - } - validation { - condition = var.resource_proxy.cpu >= 200 && var.resource_proxy.memory >= 128 - error_message = "Proxy resource must be at least: cpu=200, memory=128." 
- } -} - -variable "service_name" { - description = "Specifies the name this service will be advertised in Consul" - type = string - default = "minio" -} - -variable "secret_key" { - description = "Minio secret key" - type = string - default = "minio" -} - -variable "upstreams" { - type = list(object({ - service_name = string, - port = number, - })) - description = "List of upstream services" - default = [] -} - -variable "use_canary" { - description = "Uses canary deployment for Minio" - type = bool - default = false -} - -variable "use_host_volume" { - description = "Use Nomad host volume feature" - type = bool - default = false -} - -variable "vault_secret" { - type = object({ - use_vault_provider = bool, - vault_kv_policy_name = string, - vault_kv_path = string, - vault_kv_field_access_key = string, - vault_kv_field_secret_key = string - }) - description = "Set of properties to be able to fetch secret from vault" - default = { - use_vault_provider = false - vault_kv_policy_name = "kv" - vault_kv_path = "secret/data/minio" - vault_kv_field_access_key = "access_key" - vault_kv_field_secret_key = "secret_key" - } -} - -variable "volume_destination" { - description = "Specifies where the volume should be mounted inside the task" - type = string - default = "/data/" -} diff --git a/fdio.infra.terraform/1n_nmd/minio_s3_gateway/versions.tf b/fdio.infra.terraform/1n_nmd/minio_s3_gateway/versions.tf deleted file mode 100644 index b80610a525..0000000000 --- a/fdio.infra.terraform/1n_nmd/minio_s3_gateway/versions.tf +++ /dev/null @@ -1,13 +0,0 @@ -terraform { - required_providers { - nomad = { - source = "hashicorp/nomad" - version = "~> 1.4.15" - } - template = { - source = "hashicorp/template" - version = "~> 2.2.0" - } - } - required_version = ">= 1.0.3" -} diff --git a/fdio.infra.terraform/1n_nmd/prometheus/conf/nomad/prometheus.hcl.tftpl b/fdio.infra.terraform/1n_nmd/prometheus/conf/nomad/prometheus.hcl.tftpl deleted file mode 100644 index 224f7e5e00..0000000000 --- a/fdio.infra.terraform/1n_nmd/prometheus/conf/nomad/prometheus.hcl.tftpl +++ /dev/null @@ -1,624 +0,0 @@ -job "${job_name}" { - # The "region" parameter specifies the region in which to execute the job. - # If omitted, this inherits the default region name of "global". - # region = "${region}" - - # The "datacenters" parameter specifies the list of datacenters which should - # be considered when placing this task. This must be provided. - datacenters = "${datacenters}" - - # The "type" parameter controls the type of job, which impacts the scheduler's - # decision on placement. This configuration is optional and defaults to - # "service". For a full list of job types and their differences, please see - # the online documentation. - # - # https://www.nomadproject.io/docs/jobspec/schedulers - # - type = "service" - - update { - # The "max_parallel" parameter specifies the maximum number of updates to - # perform in parallel. In this case, this specifies to update a single task - # at a time. - max_parallel = ${max_parallel} - - health_check = "checks" - - # The "min_healthy_time" parameter specifies the minimum time the allocation - # must be in the healthy state before it is marked as healthy and unblocks - # further allocations from being updated. - min_healthy_time = "10s" - - # The "healthy_deadline" parameter specifies the deadline in which the - # allocation must be marked as healthy after which the allocation is - # automatically transitioned to unhealthy. 
Transitioning to unhealthy will - # fail the deployment and potentially roll back the job if "auto_revert" is - # set to true. - healthy_deadline = "3m" - - # The "progress_deadline" parameter specifies the deadline in which an - # allocation must be marked as healthy. The deadline begins when the first - # allocation for the deployment is created and is reset whenever an allocation - # as part of the deployment transitions to a healthy state. If no allocation - # transitions to the healthy state before the progress deadline, the - # deployment is marked as failed. - progress_deadline = "10m" - -%{ if use_canary } - # The "canary" parameter specifies that changes to the job that would result - # in destructive updates should create the specified number of canaries - # without stopping any previous allocations. Once the operator determines the - # canaries are healthy, they can be promoted which unblocks a rolling update - # of the remaining allocations at a rate of "max_parallel". - # - # Further, setting "canary" equal to the count of the task group allows - # blue/green deployments. When the job is updated, a full set of the new - # version is deployed and upon promotion the old version is stopped. - canary = ${canary} - - # Specifies if the job should auto-promote to the canary version when all - # canaries become healthy during a deployment. Defaults to false which means - # canaries must be manually updated with the nomad deployment promote - # command. - auto_promote = ${auto_promote} - - # The "auto_revert" parameter specifies if the job should auto-revert to the - # last stable job on deployment failure. A job is marked as stable if all the - # allocations as part of its deployment were marked healthy. - auto_revert = ${auto_revert} -%{ endif } - } - - # The "group" stanza defines a series of tasks that should be co-located on - # the same Nomad client. Any task within a group will be placed on the same - # client. - # - # https://www.nomadproject.io/docs/job-specification/group - # - group "${job_name}-group-1" { - # The "count" parameter specifies the number of the task groups that should - # be running under this group. This value must be non-negative and defaults - # to 1. - count = ${group_count} - - # The volume stanza allows the group to specify that it requires a given - # volume from the cluster. The key of the stanza is the name of the volume - # as it will be exposed to task configuration. - # - # https://www.nomadproject.io/docs/job-specification/volume - %{ if use_host_volume } - volume "${job_name}-volume-1" { - type = "host" - read_only = false - source = "${volume_source}" - } - %{ endif } - - # The restart stanza configures a tasks's behavior on task failure. Restarts - # happen on the client that is running the task. - # - # https://www.nomadproject.io/docs/job-specification/restart - # - restart { - interval = "30m" - attempts = 40 - delay = "15s" - mode = "delay" - } - - # The constraint allows restricting the set of eligible nodes. Constraints - # may filter on attributes or client metadata. - # - # https://www.nomadproject.io/docs/job-specification/constraint - # - constraint { - attribute = "$${attr.cpu.arch}" - operator = "!=" - value = "arm64" - } - - constraint { - attribute = "$${node.class}" - value = "builder" - } - - # The network stanza specifies the networking requirements for the task - # group, including the network mode and port allocations. 
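A note on the doubled dollar signs in the constraints above: inside a .tftpl file, ${...} is consumed by templatefile(), so values that Nomad itself should interpolate at run time have to be written as $${...} to survive rendering. The architecture constraint, for instance, comes out of rendering as plain Nomad syntax:

    constraint {
      attribute = "${attr.cpu.arch}"
      operator  = "!="
      value     = "arm64"
    }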
When scheduling - # jobs in Nomad they are provisioned across your fleet of machines along - # with other jobs and services. Because you don't know in advance what host - # your job will be provisioned on, Nomad will provide your tasks with - # network configuration when they start up. - # - # https://www.nomadproject.io/docs/job-specification/network - # - network { - port "${service_name}" { - static = ${port} - to = ${port} - } - } - - # The "task" stanza creates an individual unit of work, such as a Docker - # container, web application, or batch processing. - # - # https://www.nomadproject.io/docs/job-specification/task - # - task "${job_name}-task-1" { - # The "driver" parameter specifies the task driver that should be used to - # run the task. - driver = "exec" - - %{ if use_host_volume } - volume_mount { - volume = "${job_name}-volume-1" - destination = "${volume_destination}" - read_only = false - } - %{ endif } - - %{ if use_vault_provider } - vault { - policies = "${vault_kv_policy_name}" - } - %{ endif } - - # The "config" stanza specifies the driver configuration, which is passed - # directly to the driver to start the task. The details of configurations - # are specific to each driver, so please see specific driver - # documentation for more information. - config { - command = "local/prometheus-${version}.linux-amd64/prometheus" - args = [ - "--config.file=secrets/prometheus.yml", - "--storage.tsdb.path=${volume_destination}prometheus/", - "--storage.tsdb.retention.time=7d" - ] - } - - # The artifact stanza instructs Nomad to fetch and unpack a remote resource, - # such as a file, tarball, or binary. Nomad downloads artifacts using the - # popular go-getter library, which permits downloading artifacts from a - # variety of locations using a URL as the input source. - # - # https://www.nomadproject.io/docs/job-specification/artifact - # - artifact { - source = "${url}" - } - - # The "template" stanza instructs Nomad to manage a template, such as - # a configuration file or script. This template can optionally pull data - # from Consul or Vault to populate runtime configuration data. - # - # https://www.nomadproject.io/docs/job-specification/template - # - template { - change_mode = "noop" - change_signal = "SIGINT" - destination = "secrets/alerts.yml" - left_delimiter = "{{{" - right_delimiter = "}}}" - data = < jenkins_job_success{id=~".*"} - for: 0m - labels: - severity: critical - annotations: - summary: "Jenkins Job Health detected high failure rate on jenkins jobs." - description: "Job: {{ $labels.id }}" - - alert: JenkinsJobHealthExporterUnstable - expr: jenkins_job_unstable{id=~".*"} > jenkins_job_success{id=~".*"} - for: 0m - labels: - severity: warning - annotations: - summary: "Jenkins Job Health detected high unstable rate on jenkins jobs." - description: "Job: {{ $labels.id }}" -- name: "Consul" - rules: - - alert: ConsulServiceHealthcheckFailed - expr: consul_catalog_service_node_healthy == 0 - for: 0m - labels: - severity: critical - annotations: - summary: "Consul service healthcheck failed (instance {{ $labels.instance }})." - description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`." - - alert: ConsulMissingMasterNode - expr: consul_raft_peers < 3 - for: 0m - labels: - severity: critical - annotations: - summary: "Consul missing master node (instance {{ $labels.instance }})." - description: "Numbers of consul raft peers should be 3, in order to preserve quorum." 
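One subtlety in the JenkinsJobHealthExporter rules above: PromQL comparison operators match series pairwise on equal label sets, so each jenkins_job_failure sample is compared only against the jenkins_job_success series carrying the same labels (notably the same id), and the alert fires per job id rather than on cluster-wide totals.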
- - alert: ConsulAgentUnhealthy - expr: consul_health_node_status{status="critical"} == 1 - for: 0m - labels: - severity: critical - annotations: - summary: "Consul agent unhealthy (instance {{ $labels.instance }})." - description: "A Consul agent is down." -- name: "Hosts" - rules: - - alert: NodeDown - expr: up == 0 - for: 0m - labels: - severity: critical - annotations: - summary: "Prometheus target missing (instance {{ $labels.instance }})." - description: "A Prometheus target has disappeared. An exporter might be crashed." - - alert: HostOutOfMemory - expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10 - for: 2m - labels: - severity: warning - annotations: - summary: "Host out of memory (instance {{ $labels.instance }})." - description: "Node memory is filling up (< 10% left)." - - alert: HostOomKillDetected - expr: increase(node_vmstat_oom_kill[1m]) > 0 - for: 0m - labels: - severity: warning - annotations: - summary: "Host OOM kill detected (instance {{ $labels.instance }})." - description: "OOM kill detected." - - alert: HostMemoryUnderMemoryPressure - expr: rate(node_vmstat_pgmajfault[1m]) > 1000 - for: 2m - labels: - severity: warning - annotations: - summary: "Host memory under memory pressure (instance {{ $labels.instance }})." - description: "The node is under heavy memory pressure. High rate of major page faults." - - alert: HostOutOfDiskSpace - expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 - for: 2m - labels: - severity: warning - annotations: - summary: "Host out of disk space (instance {{ $labels.instance }})." - description: "Disk is almost full (< 10% left)." - - alert: HostRaidDiskFailure - expr: node_md_disks{state="failed"} > 0 - for: 2m - labels: - severity: warning - annotations: - summary: "Host RAID disk failure (instance {{ $labels.instance }})." - description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap." - - alert: HostConntrackLimit - expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8 - for: 5m - labels: - severity: warning - annotations: - summary: "Host conntrack limit (instance {{ $labels.instance }})." - description: "The number of conntrack is approching limit." - - alert: HostNetworkInterfaceSaturated - expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8 - for: 1m - labels: - severity: warning - annotations: - summary: "Host Network Interface Saturated (instance {{ $labels.instance }})." - description: "The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded." - - alert: HostSystemdServiceCrashed - expr: node_systemd_unit_state{state="failed"} == 1 - for: 0m - labels: - severity: warning - annotations: - summary: "Host SystemD service crashed (instance {{ $labels.instance }})." - description: "SystemD service crashed." - - alert: HostEdacCorrectableErrorsDetected - expr: increase(node_edac_correctable_errors_total[1m]) > 0 - for: 0m - labels: - severity: info - annotations: - summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})." - description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.' 
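A small inconsistency worth flagging in the EDAC rules here: the correctable-errors expression above takes its increase over a one-minute window while its description speaks of the last 5 minutes, and the uncorrectable rule below applies no window at all; the expressions and descriptions have presumably drifted apart.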
- - alert: HostEdacUncorrectableErrorsDetected - expr: node_edac_uncorrectable_errors_total > 0 - for: 0m - labels: - severity: warning - annotations: - summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})." - description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.' -- name: "Min.io" - rules: - - alert: MinioDiskOffline - expr: minio_offline_disks > 0 - for: 0m - labels: - severity: critical - annotations: - summary: "Minio disk offline (instance {{ $labels.instance }})" - description: "Minio disk is offline." - - alert: MinioStorageSpaceExhausted - expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 < 10 - for: 2m - labels: - severity: warning - annotations: - summary: "Minio storage space exhausted (instance {{ $labels.instance }})." - description: "Minio storage space is low (< 10 GB)." -- name: "Prometheus" - rules: - - alert: PrometheusConfigurationReloadFailure - expr: prometheus_config_last_reload_successful != 1 - for: 0m - labels: - severity: warning - annotations: - summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})." - description: "Prometheus configuration reload error." - - alert: PrometheusTooManyRestarts - expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2 - for: 0m - labels: - severity: warning - annotations: - summary: "Prometheus too many restarts (instance {{ $labels.instance }})." - description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping." - - alert: PrometheusAlertmanagerConfigurationReloadFailure - expr: alertmanager_config_last_reload_successful != 1 - for: 0m - labels: - severity: warning - annotations: - summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})." - description: "AlertManager configuration reload error." - - alert: PrometheusRuleEvaluationFailures - expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})." - description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts." - - alert: PrometheusTargetScrapingSlow - expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60 - for: 5m - labels: - severity: warning - annotations: - summary: "Prometheus target scraping slow (instance {{ $labels.instance }})." - description: "Prometheus is scraping exporters slowly." - - alert: PrometheusTsdbCompactionsFailed - expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})." - description: "Prometheus encountered {{ $value }} TSDB compactions failures." - - alert: PrometheusTsdbHeadTruncationsFailed - expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})." - description: "Prometheus encountered {{ $value }} TSDB head truncation failures." - - alert: PrometheusTsdbWalCorruptions - expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})." 
- description: "Prometheus encountered {{ $value }} TSDB WAL corruptions." - - alert: PrometheusTsdbWalTruncationsFailed - expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})." - description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures." -EOH - } - - template { - change_mode = "noop" - change_signal = "SIGINT" - destination = "secrets/prometheus.yml" - data = < ] +# +# # Certificate and key files for client cert authentication to the server. +# cert_file: +# key_file: +# +# # ServerName extension to indicate the name of the server. +# # http://tools.ietf.org/html/rfc4366#section-3.1 +# server_name: +# +# # Disable validation of the server certificate. +# insecure_skip_verify: true + +# The root route on which each incoming alert enters. +route: + receiver: '${slack_default_receiver}' + + # The labels by which incoming alerts are grouped together. For example, + # multiple alerts coming in for cluster=A and alertname=LatencyHigh would + # be batched into a single group. + # + # To aggregate by all possible labels use '...' as the sole label name. + # This effectively disables aggregation entirely, passing through all + # alerts as-is. This is unlikely to be what you want, unless you have + # a very low alert volume or your upstream notification system performs + # its own grouping. Example: group_by: [...] + group_by: ['alertname'] + + # When a new group of alerts is created by an incoming alert, wait at + # least 'group_wait' to send the initial notification. + # This way ensures that you get multiple alerts for the same group that start + # firing shortly after another are batched together on the first + # notification. + group_wait: 30s + + # When the first notification was sent, wait 'group_interval' to send a batch + # of new alerts that started firing for that group. + group_interval: 5m + + # If an alert has successfully been sent, wait 'repeat_interval' to + # resend them. + repeat_interval: 3h + + # All the above attributes are inherited by all child routes and can + # overwritten on each. + # The child route trees. + routes: + - match_re: + alertname: JenkinsJob.* + receiver: ${slack_jenkins_receiver} + routes: + - match: + severity: critical + receiver: '${slack_jenkins_receiver}' + + - match_re: + service: .* + receiver: ${slack_default_receiver} + routes: + - match: + severity: critical + receiver: '${slack_default_receiver}' + +# Inhibition rules allow to mute a set of alerts given that another alert is +# firing. +# We use this to mute any warning-level notifications if the same alert is +# already critical. 
+inhibit_rules: +- source_match: + severity: 'critical' + target_match: + severity: 'warning' + equal: ['alertname', 'instance'] + +receivers: +- name: '${slack_jenkins_receiver}' + slack_configs: + - api_url: 'https://hooks.slack.com/services/${slack_jenkins_api_key}' + channel: '#${slack_jenkins_channel}' + send_resolved: true + icon_url: https://avatars3.githubusercontent.com/u/3380462 + title: |- + [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }} + {{- if gt (len .CommonLabels) (len .GroupLabels) -}} + {{" "}}( + {{- with .CommonLabels.Remove .GroupLabels.Names }} + {{- range $index, $label := .SortedPairs -}} + {{ if $index }}, {{ end }} + {{- $label.Name }}="{{ $label.Value -}}" + {{- end }} + {{- end -}} + ) + {{- end }} + text: >- + {{ range .Alerts -}} + *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }} + + *Description:* {{ .Annotations.description }} + + *Details:* + {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` + {{ end }} + {{ end }} + +- name: '${slack_default_receiver}' + slack_configs: + - api_url: 'https://hooks.slack.com/services/${slack_default_api_key}' + channel: '#${slack_default_channel}' + send_resolved: true + icon_url: https://avatars3.githubusercontent.com/u/3380462 + title: |- + [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }} + {{- if gt (len .CommonLabels) (len .GroupLabels) -}} + {{" "}}( + {{- with .CommonLabels.Remove .GroupLabels.Names }} + {{- range $index, $label := .SortedPairs -}} + {{ if $index }}, {{ end }} + {{- $label.Name }}="{{ $label.Value -}}" + {{- end }} + {{- end -}} + ) + {{- end }} + text: >- + {{ range .Alerts -}} + *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }} + + *Description:* {{ .Annotations.description }} + + *Details:* + {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` + {{ end }} + {{ end }} +EOH + } + + # The service stanza instructs Nomad to register a service with Consul. + # + # https://www.nomadproject.io/docs/job-specification/service + # + service { + name = "${service_name}" + port = "${service_name}" + tags = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ] + check { + name = "Alertmanager Check Live" + type = "http" + path = "/-/healthy" + interval = "10s" + timeout = "2s" + } + } + + # The "resources" stanza describes the requirements a task needs to + # execute. Resource requirements include memory, network, cpu, and more. + # This ensures the task will execute on a machine that contains enough + # resource capacity. 
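For a sense of what the Slack title template above produces: a group with two firing NodeDown alerts whose common job label is node-exporter renders approximately as

    [FIRING:2] NodeDown for node-exporter

with any common labels that are not group-by keys appended in parentheses (label values here are illustrative).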
+ # + # https://www.nomadproject.io/docs/job-specification/resources + # + resources { + cpu = ${cpu} + memory = ${memory} + } + } + } +} diff --git a/fdio.infra.terraform/terraform-nomad-alertmanager/fdio/main.tf b/fdio.infra.terraform/terraform-nomad-alertmanager/fdio/main.tf new file mode 100644 index 0000000000..745e450a8c --- /dev/null +++ b/fdio.infra.terraform/terraform-nomad-alertmanager/fdio/main.tf @@ -0,0 +1,14 @@ +module "alertmanager" { + providers = { + nomad = nomad.yul1 + } + source = "../" + + # alertmanager + datacenters = ["yul1"] + slack_jenkins_api_key = "TE07RD1V1/B01U1NV9HV3/hKZXJJ74g2JcISq4K3QC1eG9" + slack_jenkins_channel = "fdio-jobs-monitoring" + slack_default_api_key = "TE07RD1V1/B01UUK23B6C/hZTcCu42FUv8d6rtirHtcYIi" + slack_default_channel = "fdio-infra-monitoring" + am_version = "0.23.0" +} \ No newline at end of file diff --git a/fdio.infra.terraform/terraform-nomad-alertmanager/fdio/providers.tf b/fdio.infra.terraform/terraform-nomad-alertmanager/fdio/providers.tf new file mode 100644 index 0000000000..42a6a45ce0 --- /dev/null +++ b/fdio.infra.terraform/terraform-nomad-alertmanager/fdio/providers.tf @@ -0,0 +1,13 @@ +provider "nomad" { + address = var.nomad_provider_address + alias = "yul1" + # ca_file = var.nomad_provider_ca_file + # cert_file = var.nomad_provider_cert_file + # key_file = var.nomad_provider_key_file +} + +provider "vault" { + address = var.vault_provider_address + skip_tls_verify = var.vault_provider_skip_tls_verify + token = var.vault_provider_token +} \ No newline at end of file diff --git a/fdio.infra.terraform/terraform-nomad-alertmanager/fdio/variables.tf b/fdio.infra.terraform/terraform-nomad-alertmanager/fdio/variables.tf new file mode 100644 index 0000000000..7d5be09d21 --- /dev/null +++ b/fdio.infra.terraform/terraform-nomad-alertmanager/fdio/variables.tf @@ -0,0 +1,47 @@ +variable "nomad_acl" { + description = "Nomad ACLs enabled/disabled." + type = bool + default = false +} + +variable "nomad_provider_address" { + description = "FD.io Nomad cluster address." + type = string + default = "http://10.32.8.14:4646" +} + +variable "nomad_provider_ca_file" { + description = "A local file path to a PEM-encoded certificate authority." + type = string + default = "/etc/nomad.d/ssl/nomad-ca.pem" +} + +variable "nomad_provider_cert_file" { + description = "A local file path to a PEM-encoded certificate." + type = string + default = "/etc/nomad.d/ssl/nomad-cli.pem" +} + +variable "nomad_provider_key_file" { + description = "A local file path to a PEM-encoded private key." + type = string + default = "/etc/nomad.d/ssl/nomad-cli-key.pem" +} + +variable "vault_provider_address" { + description = "Vault cluster address." + type = string + default = "http://10.30.51.28:8200" +} + +variable "vault_provider_skip_tls_verify" { + description = "Verification of the Vault server's TLS certificate." + type = bool + default = false +} + +variable "vault_provider_token" { + description = "Vault root token." 
+ type = string + sensitive = true +} \ No newline at end of file diff --git a/fdio.infra.terraform/terraform-nomad-alertmanager/fdio/versions.tf b/fdio.infra.terraform/terraform-nomad-alertmanager/fdio/versions.tf new file mode 100644 index 0000000000..385c5c3f18 --- /dev/null +++ b/fdio.infra.terraform/terraform-nomad-alertmanager/fdio/versions.tf @@ -0,0 +1,17 @@ +terraform { + backend "consul" { + address = "10.32.8.14:8500" + scheme = "http" + path = "terraform/alertmanager" + } + required_providers { + nomad = { + source = "hashicorp/nomad" + version = ">= 1.4.16" + } + vault = { + version = ">= 3.2.1" + } + } + required_version = ">= 1.1.4" +} \ No newline at end of file diff --git a/fdio.infra.terraform/terraform-nomad-alertmanager/main.tf b/fdio.infra.terraform/terraform-nomad-alertmanager/main.tf new file mode 100644 index 0000000000..e8a1389150 --- /dev/null +++ b/fdio.infra.terraform/terraform-nomad-alertmanager/main.tf @@ -0,0 +1,48 @@ +locals { + datacenters = join(",", var.datacenters) + url = join("", + [ + "https://github.com", + "/prometheus/alertmanager/releases/download/", + "v${var.am_version}/", + "alertmanager-${var.am_version}.linux-amd64.tar.gz" + ] + ) +} + +resource "nomad_job" "nomad_job_alertmanager" { + jobspec = templatefile( + "${path.module}/conf/nomad/alertmanager.hcl.tftpl", + { + auto_promote = var.auto_promote, + auto_revert = var.auto_revert, + canary = var.canary, + cpu = var.cpu, + datacenters = local.datacenters, + group_count = var.group_count, + job_name = var.job_name, + max_parallel = var.max_parallel, + memory = var.memory + port = var.port, + region = var.region, + service_name = var.service_name, + slack_jenkins_api_key = var.slack_jenkins_api_key, + slack_jenkins_channel = var.slack_jenkins_channel, + slack_jenkins_receiver = var.slack_jenkins_receiver, + slack_default_api_key = var.slack_default_api_key, + slack_default_channel = var.slack_default_channel, + slack_default_receiver = var.slack_default_receiver, + url = local.url, + use_canary = var.use_canary, + use_host_volume = var.use_host_volume, + use_vault_provider = var.vault_secret.use_vault_provider, + vault_kv_policy_name = var.vault_secret.vault_kv_policy_name, + vault_kv_path = var.vault_secret.vault_kv_path, + vault_kv_field_access_key = var.vault_secret.vault_kv_field_access_key, + vault_kv_field_secret_key = var.vault_secret.vault_kv_field_secret_key, + version = var.am_version, + volume_destination = var.volume_destination, + volume_source = var.volume_source + }) + detach = false +} diff --git a/fdio.infra.terraform/terraform-nomad-alertmanager/variables.tf b/fdio.infra.terraform/terraform-nomad-alertmanager/variables.tf new file mode 100644 index 0000000000..e452598fa6 --- /dev/null +++ b/fdio.infra.terraform/terraform-nomad-alertmanager/variables.tf @@ -0,0 +1,157 @@ +# Nomad +variable "datacenters" { + description = "Specifies the list of DCs to be considered placing this task" + type = list(string) + default = ["dc1"] +} + +variable "region" { + description = "Specifies the list of DCs to be considered placing this task" + type = string + default = "global" +} + +variable "volume_source" { + description = "The name of the volume to request" + type = string + default = "persistence" +} + +# Alertmanager +variable "am_version" { + description = "Alertmanager version" + type = string + default = "0.21.0" +} + +variable "auto_promote" { + description = "Specifies if the job should auto-promote to the canary version" + type = bool + default = true +} + +variable "auto_revert" 
{ + description = "Specifies if the job should auto-revert to the last stable job" + type = bool + default = true +} + +variable "canary" { + description = "Equal to the count of the task group allows blue/green depl." + type = number + default = 1 +} + +variable "cpu" { + description = "CPU allocation" + type = number + default = 1000 +} + +variable "group_count" { + description = "Specifies the number of the task groups running under this one" + type = number + default = 1 +} + +variable "job_name" { + description = "Specifies a name for the job" + type = string + default = "alertmanager" +} + +variable "max_parallel" { + description = "Specifies the maximum number of updates to perform in parallel" + type = number + default = 1 +} + +variable "memory" { + description = "Specifies the memory required in MB" + type = number + default = 1024 +} + +variable "port" { + description = "Specifies the static TCP/UDP port to allocate" + type = number + default = 9093 +} + +variable "service_name" { + description = "Specifies the name this service will be advertised in Consul" + type = string + default = "alertmanager" +} + +variable "use_canary" { + description = "Uses canary deployment" + type = bool + default = true +} + +variable "use_host_volume" { + description = "Use Nomad host volume feature" + type = bool + default = false +} + +variable "vault_secret" { + type = object({ + use_vault_provider = bool, + vault_kv_policy_name = string, + vault_kv_path = string, + vault_kv_field_access_key = string, + vault_kv_field_secret_key = string + }) + description = "Set of properties to be able to fetch secret from vault." + default = { + use_vault_provider = false + vault_kv_policy_name = "kv" + vault_kv_path = "secret/data/alertmanager" + vault_kv_field_access_key = "access_key" + vault_kv_field_secret_key = "secret_key" + } +} + +variable "volume_destination" { + description = "Specifies where the volume should be mounted inside the task" + type = string + default = "/data/" +} + +variable "slack_jenkins_api_key" { + description = "Alertmanager jenkins slack API key" + type = string + default = "XXXXXXXXX/XXXXXXXXXXX/XXXXXXXXXXXXXXXXXXXXXXXX" +} + +variable "slack_jenkins_receiver" { + description = "Alertmanager jenkins slack receiver" + type = string + default = "jenkins-slack-receiver" +} + +variable "slack_jenkins_channel" { + description = "Alertmanager jenkins slack channel" + type = string + default = "jenkins-channel" +} + +variable "slack_default_api_key" { + description = "Alertmanager default slack API key" + type = string + default = "XXXXXXXXX/XXXXXXXXXXX/XXXXXXXXXXXXXXXXXXXXXXXX" +} + +variable "slack_default_receiver" { + description = "Alertmanager default slack receiver" + type = string + default = "default-slack-receiver" +} + +variable "slack_default_channel" { + description = "Alertmanager default slack channel" + type = string + default = "default-channel" +} \ No newline at end of file diff --git a/fdio.infra.terraform/terraform-nomad-alertmanager/versions.tf b/fdio.infra.terraform/terraform-nomad-alertmanager/versions.tf new file mode 100644 index 0000000000..5f283ed4ea --- /dev/null +++ b/fdio.infra.terraform/terraform-nomad-alertmanager/versions.tf @@ -0,0 +1,9 @@ +terraform { + required_providers { + nomad = { + source = "hashicorp/nomad" + version = ">= 1.4.16" + } + } + required_version = ">= 1.1.4" +} \ No newline at end of file diff --git a/fdio.infra.terraform/terraform-nomad-loki/conf/nomad/loki.hcl.tftpl 
b/fdio.infra.terraform/terraform-nomad-loki/conf/nomad/loki.hcl.tftpl new file mode 100644 index 0000000000..7b38437566 --- /dev/null +++ b/fdio.infra.terraform/terraform-nomad-loki/conf/nomad/loki.hcl.tftpl @@ -0,0 +1,261 @@ +job "${job_name}" { + # The "region" parameter specifies the region in which to execute the job. + # If omitted, this inherits the default region name of "global". + # region = "${region}" + + # The "datacenters" parameter specifies the list of datacenters which should + # be considered when placing this task. This must be provided. + datacenters = "${datacenters}" + + # The "type" parameter controls the type of job, which impacts the scheduler's + # decision on placement. This configuration is optional and defaults to + # "service". For a full list of job types and their differences, please see + # the online documentation. + # + # https://www.nomadproject.io/docs/jobspec/schedulers + # + type = "service" + + update { + # The "max_parallel" parameter specifies the maximum number of updates to + # perform in parallel. In this case, this specifies to update a single task + # at a time. + max_parallel = ${max_parallel} + + health_check = "checks" + + # The "min_healthy_time" parameter specifies the minimum time the allocation + # must be in the healthy state before it is marked as healthy and unblocks + # further allocations from being updated. + min_healthy_time = "10s" + + # The "healthy_deadline" parameter specifies the deadline in which the + # allocation must be marked as healthy after which the allocation is + # automatically transitioned to unhealthy. Transitioning to unhealthy will + # fail the deployment and potentially roll back the job if "auto_revert" is + # set to true. + healthy_deadline = "3m" + + # The "progress_deadline" parameter specifies the deadline in which an + # allocation must be marked as healthy. The deadline begins when the first + # allocation for the deployment is created and is reset whenever an allocation + # as part of the deployment transitions to a healthy state. If no allocation + # transitions to the healthy state before the progress deadline, the + # deployment is marked as failed. + progress_deadline = "10m" + +%{ if use_canary } + # The "canary" parameter specifies that changes to the job that would result + # in destructive updates should create the specified number of canaries + # without stopping any previous allocations. Once the operator determines the + # canaries are healthy, they can be promoted which unblocks a rolling update + # of the remaining allocations at a rate of "max_parallel". + # + # Further, setting "canary" equal to the count of the task group allows + # blue/green deployments. When the job is updated, a full set of the new + # version is deployed and upon promotion the old version is stopped. + canary = ${canary} + + # Specifies if the job should auto-promote to the canary version when all + # canaries become healthy during a deployment. Defaults to false which means + # canaries must be manually updated with the nomad deployment promote + # command. + auto_promote = ${auto_promote} + + # The "auto_revert" parameter specifies if the job should auto-revert to the + # last stable job on deployment failure. A job is marked as stable if all the + # allocations as part of its deployment were marked healthy. + auto_revert = ${auto_revert} +%{ endif } + } + + # The "group" stanza defines a series of tasks that should be co-located on + # the same Nomad client. 
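The ${url} consumed by the artifact stanza further down is presumably assembled in this module's main.tf the same way terraform-nomad-alertmanager/main.tf does it earlier in this patch. A sketch under that assumption (the variable name gl_version and the exact archive name are illustrative, not taken from this patch):

    locals {
      url = join("", [
        "https://github.com",
        "/grafana/loki/releases/download/",
        "v${var.gl_version}/",
        "loki-linux-amd64.zip"
      ])
    }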
+
+  # The "group" stanza defines a series of tasks that should be co-located on
+  # the same Nomad client. Any task within a group will be placed on the same
+  # client.
+  #
+  # https://www.nomadproject.io/docs/job-specification/group
+  #
+  group "${job_name}-group-1" {
+    # The "count" parameter specifies the number of the task groups that should
+    # be running under this group. This value must be non-negative and defaults
+    # to 1.
+    count = ${group_count}
+
+    # The volume stanza allows the group to specify that it requires a given
+    # volume from the cluster. The key of the stanza is the name of the volume
+    # as it will be exposed to task configuration.
+    #
+    # https://www.nomadproject.io/docs/job-specification/volume
+    %{ if use_host_volume }
+    volume "${job_name}-volume-1" {
+      type      = "host"
+      read_only = false
+      source    = "${volume_source}"
+    }
+    %{ endif }
+
+    # The restart stanza configures a task's behavior on task failure. Restarts
+    # happen on the client that is running the task.
+    #
+    # https://www.nomadproject.io/docs/job-specification/restart
+    #
+    restart {
+      interval = "30m"
+      attempts = 40
+      delay    = "15s"
+      mode     = "delay"
+    }
+
+    # The constraint allows restricting the set of eligible nodes. Constraints
+    # may filter on attributes or client metadata.
+    #
+    # https://www.nomadproject.io/docs/job-specification/constraint
+    #
+    constraint {
+      attribute = "$${attr.cpu.arch}"
+      operator  = "!="
+      value     = "arm64"
+    }
+
+    constraint {
+      attribute = "$${node.class}"
+      value     = "builder"
+    }
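The doubled dollar signs in the two constraints above are deliberate. This file is a Terraform template, so a single-dollar placeholder such as ${group_count} is substituted at render time, while $${attr.cpu.arch} and $${node.class} escape rendering and reach Nomad as the literals ${attr.cpu.arch} and ${node.class}, which Nomad resolves per node at scheduling time. A self-contained sketch of the same mechanism, with a hypothetical variable name:

  variable "excluded_arch" {
    type    = string
    default = "arm64"
  }

  locals {
    # Terraform renders ${var.excluded_arch} now; $${attr.cpu.arch} survives
    # as the literal ${attr.cpu.arch} for Nomad to resolve at placement time.
    constraint_snippet = <<-EOT
      constraint {
        attribute = "$${attr.cpu.arch}"
        operator  = "!="
        value     = "${var.excluded_arch}"
      }
    EOT
  }

  output "rendered_constraint" {
    value = local.constraint_snippet
  }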
+
+    # The network stanza specifies the networking requirements for the task
+    # group, including the network mode and port allocations. When scheduling
+    # jobs in Nomad they are provisioned across your fleet of machines along
+    # with other jobs and services. Because you don't know in advance what host
+    # your job will be provisioned on, Nomad will provide your tasks with
+    # network configuration when they start up.
+    #
+    # https://www.nomadproject.io/docs/job-specification/network
+    #
+    network {
+      port "${service_name}" {
+        static = ${port}
+        to     = ${port}
+      }
+    }
+
+    # The "task" stanza creates an individual unit of work, such as a Docker
+    # container, web application, or batch processing.
+    #
+    # https://www.nomadproject.io/docs/job-specification/task
+    #
+    task "${job_name}-task-1" {
+      # The "driver" parameter specifies the task driver that should be used
+      # to run the task.
+      driver = "exec"
+
+      %{ if use_host_volume }
+      volume_mount {
+        volume      = "${job_name}-volume-1"
+        destination = "${volume_destination}"
+        read_only   = false
+      }
+      %{ endif }
+
+      %{ if use_vault_provider }
+      vault {
+        policies = "${vault_kv_policy_name}"
+      }
+      %{ endif }
+
+      # The "config" stanza specifies the driver configuration, which is passed
+      # directly to the driver to start the task. The details of configurations
+      # are specific to each driver, so please see specific driver
+      # documentation for more information.
+      config {
+        command = "local/loki-linux-amd64"
+      }
+
+      # The artifact stanza instructs Nomad to fetch and unpack a remote
+      # resource, such as a file, tarball, or binary. Nomad downloads artifacts
+      # using the popular go-getter library, which permits downloading
+      # artifacts from a variety of locations using a URL as the input source.
+      #
+      # https://www.nomadproject.io/docs/job-specification/artifact
+      #
+      artifact {
+        source = "${url}"
+        args = [
+          "-config.file secrets/config.yml"
+        ]
+      }
+
+      template {
+        change_mode   = "noop"
+        change_signal = "SIGINT"
+        destination   = "secrets/loki.yml"
+        data          = <
[... gap in excerpt: the remainder of the Loki template, the other terraform-nomad-loki module files, and the start of terraform-nomad-prometheus/conf/nomad/prometheus.hcl.tftpl are missing; the alert rules below belong to that Prometheus template ...]
+    - alert: JenkinsJobHealthExporterFailures
+      expr: jenkins_job_failure{id=~".*"} > jenkins_job_success{id=~".*"}
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Jenkins Job Health detected high failure rate on Jenkins jobs."
+        description: "Job: {{ $labels.id }}"
+    - alert: JenkinsJobHealthExporterUnstable
+      expr: jenkins_job_unstable{id=~".*"} > jenkins_job_success{id=~".*"}
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Jenkins Job Health detected high unstable rate on Jenkins jobs."
+        description: "Job: {{ $labels.id }}"
+- name: "Consul"
+  rules:
+    - alert: ConsulServiceHealthcheckFailed
+      expr: consul_catalog_service_node_healthy == 0
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Consul service healthcheck failed (instance {{ $labels.instance }})."
+        description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`."
+    - alert: ConsulMissingMasterNode
+      expr: consul_raft_peers < 3
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Consul missing master node (instance {{ $labels.instance }})."
+        description: "Number of Consul raft peers should be 3 in order to preserve quorum."
+    - alert: ConsulAgentUnhealthy
+      expr: consul_health_node_status{status="critical"} == 1
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Consul agent unhealthy (instance {{ $labels.instance }})."
+        description: "A Consul agent is down."
+- name: "Hosts"
+  rules:
+    - alert: NodeDown
+      expr: up == 0
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Prometheus target missing (instance {{ $labels.instance }})."
+        description: "A Prometheus target has disappeared. An exporter might have crashed."
+    - alert: HostOutOfMemory
+      expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host out of memory (instance {{ $labels.instance }})."
+        description: "Node memory is filling up (< 10% left)."
+    - alert: HostOomKillDetected
+      expr: increase(node_vmstat_oom_kill[1m]) > 0
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host OOM kill detected (instance {{ $labels.instance }})."
+        description: "OOM kill detected."
+    - alert: HostMemoryUnderMemoryPressure
+      expr: rate(node_vmstat_pgmajfault[1m]) > 1000
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host memory under memory pressure (instance {{ $labels.instance }})."
+        description: "The node is under heavy memory pressure. High rate of major page faults."
+    - alert: HostOutOfDiskSpace
+      expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host out of disk space (instance {{ $labels.instance }})."
+        description: "Disk is almost full (< 10% left)."
+    - alert: HostRaidDiskFailure
+      expr: node_md_disks{state="failed"} > 0
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host RAID disk failure (instance {{ $labels.instance }})."
+        description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap."
+    - alert: HostConntrackLimit
+      expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host conntrack limit (instance {{ $labels.instance }})."
+        description: "The number of conntrack entries is approaching the limit."
+    - alert: HostNetworkInterfaceSaturated
+      expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
+      for: 1m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host network interface saturated (instance {{ $labels.instance }})."
+        description: "The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded."
+    - alert: HostSystemdServiceCrashed
+      expr: node_systemd_unit_state{state="failed"} == 1
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host SystemD service crashed (instance {{ $labels.instance }})."
+        description: "A SystemD service on {{ $labels.instance }} entered the failed state."
+    - alert: HostEdacCorrectableErrorsDetected
+      expr: increase(node_edac_correctable_errors_total[1m]) > 0
+      for: 0m
+      labels:
+        severity: info
+      annotations:
+        summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})."
+        description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.'
+    - alert: HostEdacUncorrectableErrorsDetected
+      expr: node_edac_uncorrectable_errors_total > 0
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})."
+        description: '{{ $labels.instance }} has {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC.'
+- name: "Min.io"
+  rules:
+    - alert: MinioDiskOffline
+      expr: minio_offline_disks > 0
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Minio disk offline (instance {{ $labels.instance }})."
+        description: "Minio disk is offline."
+    - alert: MinioStorageSpaceExhausted
+      expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 < 10
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Minio storage space exhausted (instance {{ $labels.instance }})."
+        description: "Minio storage space is low (< 10 GB left)."
+- name: "Prometheus"
+  rules:
+    - alert: PrometheusConfigurationReloadFailure
+      expr: prometheus_config_last_reload_successful != 1
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})."
+        description: "Prometheus configuration reload error."
+    - alert: PrometheusTooManyRestarts
+      expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Prometheus too many restarts (instance {{ $labels.instance }})."
+        description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping."
+    - alert: PrometheusAlertmanagerConfigurationReloadFailure
+      expr: alertmanager_config_last_reload_successful != 1
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})."
+        description: "AlertManager configuration reload error."
+    - alert: PrometheusRuleEvaluationFailures
+      expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})."
+        description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts."
+    - alert: PrometheusTargetScrapingSlow
+      expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Prometheus target scraping slow (instance {{ $labels.instance }})."
+        description: "Prometheus is scraping exporters slowly."
+    - alert: PrometheusTsdbCompactionsFailed
+      expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})."
+        description: "Prometheus encountered {{ $value }} TSDB compaction failures."
+    - alert: PrometheusTsdbHeadTruncationsFailed
+      expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})."
+        description: "Prometheus encountered {{ $value }} TSDB head truncation failures."
+    - alert: PrometheusTsdbWalCorruptions
+      expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})."
+        description: "Prometheus encountered {{ $value }} TSDB WAL corruptions."
+    - alert: PrometheusTsdbWalTruncationsFailed
+      expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})."
+        description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures."
+EOH
+      }
+
+      template {
+        change_mode   = "noop"
+        change_signal = "SIGINT"
+        destination   = "secrets/prometheus.yml"
+        data          = <
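Two notes on the template stanzas above. With change_mode = "noop", the change_signal value is inert: Nomad only consults change_signal when change_mode is set to "signal", so re-rendering these files never signals the running task. And since the .hcl.tftpl files carry Terraform placeholders, the usual wiring is to render them with templatefile() and submit the result through the nomad_job resource of the hashicorp/nomad provider pinned in versions.tf; the module main.tf files are not shown in this excerpt, so the sketch below is an assumption about that wiring, not a quote of it:

  variable "job_name" {
    type    = string
    default = "prometheus"
  }

  resource "nomad_job" "prometheus" {
    # Render the template into a concrete jobspec. The map must supply a
    # value for every ${...} placeholder the template references; only
    # job_name is shown here, so this is schematic rather than complete.
    jobspec = templatefile("${path.module}/conf/nomad/prometheus.hcl.tftpl", {
      job_name = var.job_name
      # ...one entry per remaining placeholder...
    })
  }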