From a44eef233de959c5679602dc6cd1ed866d6abc14 Mon Sep 17 00:00:00 2001 From: pmikus Date: Thu, 7 Jan 2021 14:27:38 +0000 Subject: Infra: Monitoring capability + Monitoring SOA + Nomad alertmanager job + Nomad prometheus job + Nomad grafana job Signed-off-by: pmikus Change-Id: I0b32e9c87276ba1a2d4a5322816f3473c737eae2 --- terraform-ci-infra/1n_nmd/.terraform.lock.hcl | 0 .../alertmanager/conf/nomad/alertmanager.hcl | 333 + terraform-ci-infra/1n_nmd/alertmanager/main.tf | 37 + .../1n_nmd/alertmanager/variables.tf | 84 + .../1n_nmd/exporter/conf/nomad/exporter.hcl | 587 + terraform-ci-infra/1n_nmd/exporter/main.tf | 64 + terraform-ci-infra/1n_nmd/exporter/variables.tf | 76 + .../grafana/conf/blackbox_exporter_http.json | 1030 ++ .../grafana/conf/blackbox_exporter_icmp.json | 368 + terraform-ci-infra/1n_nmd/grafana/conf/consul.json | 1438 ++ .../1n_nmd/grafana/conf/docker_cadvisor.json | 2040 +++ .../1n_nmd/grafana/conf/node_exporter.json | 13696 +++++++++++++++++++ terraform-ci-infra/1n_nmd/grafana/conf/nomad.json | 869 ++ .../1n_nmd/grafana/conf/nomad/grafana.hcl | 331 + .../1n_nmd/grafana/conf/prometheus.json | 3055 +++++ terraform-ci-infra/1n_nmd/grafana/main.tf | 24 + terraform-ci-infra/1n_nmd/grafana/variables.tf | 66 + terraform-ci-infra/1n_nmd/main.tf | 186 +- .../1n_nmd/prometheus/conf/nomad/prometheus.hcl | 569 + terraform-ci-infra/1n_nmd/prometheus/main.tf | 37 + terraform-ci-infra/1n_nmd/prometheus/variables.tf | 84 + terraform-ci-infra/1n_nmd/terraform.tfstate | 558 +- terraform-ci-infra/1n_nmd/terraform.tfstate.backup | 625 +- 23 files changed, 26096 insertions(+), 61 deletions(-) mode change 100755 => 100644 terraform-ci-infra/1n_nmd/.terraform.lock.hcl create mode 100644 terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl create mode 100644 terraform-ci-infra/1n_nmd/alertmanager/main.tf create mode 100644 terraform-ci-infra/1n_nmd/alertmanager/variables.tf create mode 100644 terraform-ci-infra/1n_nmd/exporter/conf/nomad/exporter.hcl create mode 100644 terraform-ci-infra/1n_nmd/exporter/main.tf create mode 100644 terraform-ci-infra/1n_nmd/exporter/variables.tf create mode 100644 terraform-ci-infra/1n_nmd/grafana/conf/blackbox_exporter_http.json create mode 100644 terraform-ci-infra/1n_nmd/grafana/conf/blackbox_exporter_icmp.json create mode 100644 terraform-ci-infra/1n_nmd/grafana/conf/consul.json create mode 100644 terraform-ci-infra/1n_nmd/grafana/conf/docker_cadvisor.json create mode 100644 terraform-ci-infra/1n_nmd/grafana/conf/node_exporter.json create mode 100644 terraform-ci-infra/1n_nmd/grafana/conf/nomad.json create mode 100644 terraform-ci-infra/1n_nmd/grafana/conf/nomad/grafana.hcl create mode 100644 terraform-ci-infra/1n_nmd/grafana/conf/prometheus.json create mode 100644 terraform-ci-infra/1n_nmd/grafana/main.tf create mode 100644 terraform-ci-infra/1n_nmd/grafana/variables.tf create mode 100644 terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl create mode 100644 terraform-ci-infra/1n_nmd/prometheus/main.tf create mode 100644 terraform-ci-infra/1n_nmd/prometheus/variables.tf (limited to 'terraform-ci-infra/1n_nmd') diff --git a/terraform-ci-infra/1n_nmd/.terraform.lock.hcl b/terraform-ci-infra/1n_nmd/.terraform.lock.hcl old mode 100755 new mode 100644 diff --git a/terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl b/terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl new file mode 100644 index 0000000000..4560cf07ab --- /dev/null +++ b/terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl @@ -0,0 +1,333 @@ +job "${job_name}" { + # The "region" parameter specifies the region in which to execute the job. + # If omitted, this inherits the default region name of "global". + # region = "global" + # + # The "datacenters" parameter specifies the list of datacenters which should + # be considered when placing this task. This must be provided. + datacenters = "${datacenters}" + + # The "type" parameter controls the type of job, which impacts the scheduler's + # decision on placement. This configuration is optional and defaults to + # "service". For a full list of job types and their differences, please see + # the online documentation. + # + # For more information, please see the online documentation at: + # + # https://www.nomadproject.io/docs/jobspec/schedulers + # + type = "service" + + update { + # The "max_parallel" parameter specifies the maximum number of updates to + # perform in parallel. In this case, this specifies to update a single task + # at a time. + max_parallel = 1 + + health_check = "checks" + + # The "min_healthy_time" parameter specifies the minimum time the allocation + # must be in the healthy state before it is marked as healthy and unblocks + # further allocations from being updated. + min_healthy_time = "10s" + + # The "healthy_deadline" parameter specifies the deadline in which the + # allocation must be marked as healthy after which the allocation is + # automatically transitioned to unhealthy. Transitioning to unhealthy will + # fail the deployment and potentially roll back the job if "auto_revert" is + # set to true. + healthy_deadline = "3m" + + # The "progress_deadline" parameter specifies the deadline in which an + # allocation must be marked as healthy. The deadline begins when the first + # allocation for the deployment is created and is reset whenever an allocation + # as part of the deployment transitions to a healthy state. If no allocation + # transitions to the healthy state before the progress deadline, the + # deployment is marked as failed. + progress_deadline = "10m" + +%{ if use_canary } + # The "canary" parameter specifies that changes to the job that would result + # in destructive updates should create the specified number of canaries + # without stopping any previous allocations. Once the operator determines the + # canaries are healthy, they can be promoted which unblocks a rolling update + # of the remaining allocations at a rate of "max_parallel". + # + # Further, setting "canary" equal to the count of the task group allows + # blue/green deployments. When the job is updated, a full set of the new + # version is deployed and upon promotion the old version is stopped. + canary = 1 + + # Specifies if the job should auto-promote to the canary version when all + # canaries become healthy during a deployment. Defaults to false which means + # canaries must be manually updated with the nomad deployment promote + # command. + auto_promote = true + + # The "auto_revert" parameter specifies if the job should auto-revert to the + # last stable job on deployment failure. A job is marked as stable if all the + # allocations as part of its deployment were marked healthy. + auto_revert = true +%{ endif } + } + + # The "group" stanza defines a series of tasks that should be co-located on + # the same Nomad client. Any task within a group will be placed on the same + # client. + # + # For more information and examples on the "group" stanza, please see + # the online documentation at: + # + # https://www.nomadproject.io/docs/job-specification/group + # + group "prod-group1-${service_name}" { + # The "count" parameter specifies the number of the task groups that should + # be running under this group. This value must be non-negative and defaults + # to 1. + count = ${group_count} + + # The constraint allows restricting the set of eligible nodes. Constraints + # may filter on attributes or client metadata. + # + # For more information and examples on the "volume" stanza, please see + # the online documentation at: + # + # https://www.nomadproject.io/docs/job-specification/constraint + # + constraint { + attribute = "$${attr.cpu.arch}" + operator = "!=" + value = "arm64" + } + + # The "task" stanza creates an individual unit of work, such as a Docker + # container, web application, or batch processing. + # + # For more information and examples on the "task" stanza, please see + # the online documentation at: + # + # https://www.nomadproject.io/docs/job-specification/task + # + task "prod-task1-${service_name}" { + # The "driver" parameter specifies the task driver that should be used to + # run the task. + driver = "exec" + + %{ if use_vault_provider } + vault { + policies = "${vault_kv_policy_name}" + } + %{ endif } + + # The "config" stanza specifies the driver configuration, which is passed + # directly to the driver to start the task. The details of configurations + # are specific to each driver, so please see specific driver + # documentation for more information. + config { + command = "local/alertmanager-${version}.linux-amd64/alertmanager" + args = [ + "--config.file=secrets/alertmanager.yml" + ] + } + + # The artifact stanza instructs Nomad to fetch and unpack a remote resource, + # such as a file, tarball, or binary. Nomad downloads artifacts using the + # popular go-getter library, which permits downloading artifacts from a + # variety of locations using a URL as the input source. + # + # For more information and examples on the "artifact" stanza, please see + # the online documentation at: + # + # https://www.nomadproject.io/docs/job-specification/artifact + # + artifact { + source = "${url}" + } + + # The "template" stanza instructs Nomad to manage a template, such as + # a configuration file or script. This template can optionally pull data + # from Consul or Vault to populate runtime configuration data. + # + # For more information and examples on the "template" stanza, please see + # the online documentation at: + # + # https://www.nomadproject.io/docs/job-specification/template + # + template { + change_mode = "noop" + change_signal = "SIGINT" + destination = "secrets/alertmanager.yml" + left_delimiter = "{{{" + right_delimiter = "}}}" + data = < ] +# +# # Certificate and key files for client cert authentication to the server. +# cert_file: +# key_file: +# +# # ServerName extension to indicate the name of the server. +# # http://tools.ietf.org/html/rfc4366#section-3.1 +# server_name: +# +# # Disable validation of the server certificate. +# insecure_skip_verify: true + +# The root route on which each incoming alert enters. +route: + receiver: '${default_receiver}' + + # The labels by which incoming alerts are grouped together. For example, + # multiple alerts coming in for cluster=A and alertname=LatencyHigh would + # be batched into a single group. + # + # To aggregate by all possible labels use '...' as the sole label name. + # This effectively disables aggregation entirely, passing through all + # alerts as-is. This is unlikely to be what you want, unless you have + # a very low alert volume or your upstream notification system performs + # its own grouping. Example: group_by: [...] + group_by: ['alertname', 'cluster', 'service'] + + # When a new group of alerts is created by an incoming alert, wait at + # least 'group_wait' to send the initial notification. + # This way ensures that you get multiple alerts for the same group that start + # firing shortly after another are batched together on the first + # notification. + group_wait: 30s + + # When the first notification was sent, wait 'group_interval' to send a batch + # of new alerts that started firing for that group. + group_interval: 5m + + # If an alert has successfully been sent, wait 'repeat_interval' to + # resend them. + repeat_interval: 3h + + # All the above attributes are inherited by all child routes and can + # overwritten on each. + # The child route trees. + routes: + # This routes performs a regular expression match on alert labels to + # catch alerts that are related to a list of services. + - match_re: + service: .* + receiver: ${default_receiver} + # The service has a sub-route for critical alerts, any alerts + # that do not match, i.e. severity != critical, fall-back to the + # parent node and are sent to 'team-X-mails' + routes: + - match: + severity: critical + receiver: '${default_receiver}' + +# Inhibition rules allow to mute a set of alerts given that another alert is +# firing. +# We use this to mute any warning-level notifications if the same alert is +# already critical. +inhibit_rules: +- source_match: + severity: 'critical' + target_match: + severity: 'warning' + # Apply inhibition if the alertname is the same. + # CAUTION: + # If all label names listed in `equal` are missing + # from both the source and target alerts, + # the inhibition rule will apply! + equal: ['alertname', 'cluster', 'service'] + +receivers: +- name: '${default_receiver}' + slack_configs: + - channel: '#${slack_channel}' + send_resolved: true + icon_url: https://avatars3.githubusercontent.com/u/3380462 + title: |- + [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }} + {{- if gt (len .CommonLabels) (len .GroupLabels) -}} + {{" "}}( + {{- with .CommonLabels.Remove .GroupLabels.Names }} + {{- range $index, $label := .SortedPairs -}} + {{ if $index }}, {{ end }} + {{- $label.Name }}="{{ $label.Value -}}" + {{- end }} + {{- end -}} + ) + {{- end }} + text: >- + {{ range .Alerts -}} + *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }} + + *Description:* {{ .Annotations.description }} + + *Details:* + {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` + {{ end }} + {{ end }} +EOH + } + + # The service stanza instructs Nomad to register a service with Consul. + # + # For more information and examples on the "task" stanza, please see + # the online documentation at: + # + # https://www.nomadproject.io/docs/job-specification/service + # + service { + name = "${service_name}" + port = "${service_name}" + tags = [ "${service_name}$${NOMAD_ALLOC_INDEX}" ] + check { + name = "Alertmanager Check Live" + type = "http" + path = "/-/healthy" + interval = "10s" + timeout = "2s" + } + } + + # The "resources" stanza describes the requirements a task needs to + # execute. Resource requirements include memory, network, cpu, and more. + # This ensures the task will execute on a machine that contains enough + # resource capacity. + # + # For more information and examples on the "resources" stanza, please see + # the online documentation at: + # + # https://www.nomadproject.io/docs/job-specification/resources + # + resources { + cpu = ${cpu} + memory = ${mem} + # The network stanza specifies the networking requirements for the task + # group, including the network mode and port allocations. When scheduling + # jobs in Nomad they are provisioned across your fleet of machines along + # with other jobs and services. Because you don't know in advance what host + # your job will be provisioned on, Nomad will provide your tasks with + # network configuration when they start up. + # + # For more information and examples on the "template" stanza, please see + # the online documentation at: + # + # https://www.nomadproject.io/docs/job-specification/network + # + network { + port "${service_name}" { + static = ${port} + } + } + } + } + } +} \ No newline at end of file diff --git a/terraform-ci-infra/1n_nmd/alertmanager/main.tf b/terraform-ci-infra/1n_nmd/alertmanager/main.tf new file mode 100644 index 0000000000..411c78a558 --- /dev/null +++ b/terraform-ci-infra/1n_nmd/alertmanager/main.tf @@ -0,0 +1,37 @@ +locals { + datacenters = join(",", var.nomad_datacenters) + + alertmanager_url = join("", + [ + "https://github.com", + "/prometheus/alertmanager/releases/download/", + "v${var.alertmanager_version}/", + "alertmanager-${var.alertmanager_version}.linux-amd64.tar.gz" + ] + ) +} + +data "template_file" "nomad_job_alertmanager" { + template = file("${path.module}/conf/nomad/alertmanager.hcl") + vars = { + datacenters = local.datacenters + url = local.alertmanager_url + job_name = var.alertmanager_job_name + use_canary = var.alertmanager_use_canary + group_count = var.alertmanager_group_count + service_name = var.alertmanager_service_name + use_vault_provider = var.alertmanager_vault_secret.use_vault_provider + version = var.alertmanager_version + cpu = var.alertmanager_cpu + mem = var.alertmanager_mem + port = var.alertmanager_port + slack_api_url = var.alertmanager_slack_api_url + slack_channel = var.alertmanager_slack_channel + default_receiver = var.alertmanager_default_receiver + } +} + +resource "nomad_job" "nomad_job_alertmanager" { + jobspec = data.template_file.nomad_job_alertmanager.rendered + detach = false +} \ No newline at end of file diff --git a/terraform-ci-infra/1n_nmd/alertmanager/variables.tf b/terraform-ci-infra/1n_nmd/alertmanager/variables.tf new file mode 100644 index 0000000000..ebd862123d --- /dev/null +++ b/terraform-ci-infra/1n_nmd/alertmanager/variables.tf @@ -0,0 +1,84 @@ +# Nomad +variable "nomad_datacenters" { + description = "Nomad data centers" + type = list(string) + default = [ "dc1" ] +} + +# Alermanager +variable "alertmanager_job_name" { + description = "Job name" + type = string + default = "alertmanager" +} + +variable "alertmanager_group_count" { + description = "Number of group instances" + type = number + default = 1 +} + +variable "alertmanager_service_name" { + description = "Service name" + type = string + default = "alertmanager" +} + +variable "alertmanager_version" { + description = "Version" + type = string + default = "0.21.0" +} + +variable "alertmanager_use_canary" { + description = "Uses canary deployment" + type = bool + default = false +} + +variable "alertmanager_vault_secret" { + description = "Set of properties to be able to fetch secret from vault" + type = object({ + use_vault_provider = bool, + vault_kv_policy_name = string, + vault_kv_path = string, + vault_kv_field_access_key = string, + vault_kv_field_secret_key = string + }) +} + +variable "alertmanager_cpu" { + description = "CPU allocation" + type = number + default = 1000 +} + +variable "alertmanager_mem" { + description = "RAM allocation" + type = number + default = 1024 +} + +variable "alertmanager_port" { + description = "TCP allocation" + type = number + default = 9093 +} + +variable "alertmanager_default_receiver" { + description = "Alertmanager default receiver" + type = string + default = "default-receiver" +} + +variable "alertmanager_slack_api_url" { + description = "Alertmanager slack API URL" + type = string + default = "https://hooks.slack.com/services/XXXXXXXXX/XXXXXXXXXXX/XXXXXXXXXXXXXXXXXXXXXXXX" +} + +variable "alertmanager_slack_channel" { + description = "Alertmanager slack channel" + type = string + default = "slack-channel" +} \ No newline at end of file diff --git a/terraform-ci-infra/1n_nmd/exporter/conf/nomad/exporter.hcl b/terraform-ci-infra/1n_nmd/exporter/conf/nomad/exporter.hcl new file mode 100644 index 0000000000..4fd0768ae7 --- /dev/null +++ b/terraform-ci-infra/1n_nmd/exporter/conf/nomad/exporter.hcl @@ -0,0 +1,587 @@ +job "${job_name}" { + # The "region" parameter specifies the region in which to execute the job. + # If omitted, this inherits the default region name of "global". + # region = "global" + # + # The "datacenters" parameter specifies the list of datacenters which should + # be considered when placing this task. This must be provided. + datacenters = "${datacenters}" + + # The "type" parameter controls the type of job, which impacts the scheduler's + # decision on placement. This configuration is optional and defaults to + # "service". For a full list of job types and their differences, please see + # the online documentation. + # + # https://www.nomadproject.io/docs/jobspec/schedulers + # + type = "system" + + update { + # The "max_parallel" parameter specifies the maximum number of updates to + # perform in parallel. In this case, this specifies to update a single task + # at a time. + max_parallel = 1 + + health_check = "checks" + + # The "min_healthy_time" parameter specifies the minimum time the allocation + # must be in the healthy state before it is marked as healthy and unblocks + # further allocations from being updated. + min_healthy_time = "10s" + + # The "healthy_deadline" parameter specifies the deadline in which the + # allocation must be marked as healthy after which the allocation is + # automatically transitioned to unhealthy. Transitioning to unhealthy will + # fail the deployment and potentially roll back the job if "auto_revert" is + # set to true. + healthy_deadline = "3m" + + # The "progress_deadline" parameter specifies the deadline in which an + # allocation must be marked as healthy. The deadline begins when the first + # allocation for the deployment is created and is reset whenever an allocation + # as part of the deployment transitions to a healthy state. If no allocation + # transitions to the healthy state before the progress deadline, the + # deployment is marked as failed. + progress_deadline = "10m" + +%{ if use_canary } + # The "canary" parameter specifies that changes to the job that would result + # in destructive updates should create the specified number of canaries + # without stopping any previous allocations. Once the operator determines the + # canaries are healthy, they can be promoted which unblocks a rolling update + # of the remaining allocations at a rate of "max_parallel". + # + # Further, setting "canary" equal to the count of the task group allows + # blue/green deployments. When the job is updated, a full set of the new + # version is deployed and upon promotion the old version is stopped. + canary = 1 + + # Specifies if the job should auto-promote to the canary version when all + # canaries become healthy during a deployment. Defaults to false which means + # canaries must be manually updated with the nomad deployment promote + # command. + auto_promote = true + + # The "auto_revert" parameter specifies if the job should auto-revert to the + # last stable job on deployment failure. A job is marked as stable if all the + # allocations as part of its deployment were marked healthy. + auto_revert = true +%{ endif } + } + + # The "group" stanza defines a series of tasks that should be co-located on + # the same Nomad client. Any task within a group will be placed on the same + # client. + # + # https://www.nomadproject.io/docs/job-specification/group + # + group "prod-group1-exporter-amd64" { + # The constraint allows restricting the set of eligible nodes. Constraints + # may filter on attributes or client metadata. + # + # https://www.nomadproject.io/docs/job-specification/constraint + # + constraint { + attribute = "$${attr.cpu.arch}" + operator = "!=" + value = "arm64" + } + + # The "task" stanza creates an individual unit of work, such as a Docker + # container, web application, or batch processing. + # + # https://www.nomadproject.io/docs/job-specification/task + # + task "prod-task1-${node_service_name}-amd64" { + # The "driver" parameter specifies the task driver that should be used to + # run the task. + driver = "raw_exec" + + # The "config" stanza specifies the driver configuration, which is passed + # directly to the driver to start the task. The details of configurations + # are specific to each driver, so please see specific driver + # documentation for more information. + config { + command = "local/node_exporter-${node_version}.linux-amd64/node_exporter" + } + + # The artifact stanza instructs Nomad to fetch and unpack a remote resource, + # such as a file, tarball, or binary. Nomad downloads artifacts using the + # popular go-getter library, which permits downloading artifacts from a + # variety of locations using a URL as the input source. + # + # https://www.nomadproject.io/docs/job-specification/artifact + # + artifact { + source = "${node_url_amd64}" + } + + # The service stanza instructs Nomad to register a service with Consul. + # + # https://www.nomadproject.io/docs/job-specification/service + # + service { + name = "${node_service_name}" + port = "${node_service_name}" + check { + name = "Node Exporter Check Live" + type = "http" + path = "/metrics" + interval = "10s" + timeout = "2s" + } + } + + # The "resources" stanza describes the requirements a task needs to + # execute. Resource requirements include memory, network, cpu, and more. + # This ensures the task will execute on a machine that contains enough + # resource capacity. + # + # https://www.nomadproject.io/docs/job-specification/resources + # + resources { + cpu = 500 + # The network stanza specifies the networking requirements for the task + # group, including the network mode and port allocations. When scheduling + # jobs in Nomad they are provisioned across your fleet of machines along + # with other jobs and services. Because you don't know in advance what host + # your job will be provisioned on, Nomad will provide your tasks with + # network configuration when they start up. + # + # https://www.nomadproject.io/docs/job-specification/network + # + network { + port "${node_service_name}" { + static = ${node_port} + } + } + } + } + task "prod-task2-${blackbox_service_name}-amd64" { + # The "driver" parameter specifies the task driver that should be used to + # run the task. + driver = "exec" + + # The "config" stanza specifies the driver configuration, which is passed + # directly to the driver to start the task. The details of configurations + # are specific to each driver, so please see specific driver + # documentation for more information. + config { + command = "local/blackbox_exporter-${blackbox_version}.linux-amd64/blackbox_exporter" + args = [ + "--config.file=secrets/blackbox.yml" + ] + } + + # The "template" stanza instructs Nomad to manage a template, such as + # a configuration file or script. This template can optionally pull data + # from Consul or Vault to populate runtime configuration data. + # + # https://www.nomadproject.io/docs/job-specification/template + # + template { + change_mode = "noop" + change_signal = "SIGINT" + destination = "secrets/blackbox.yml" + data = < 0", + "instant": false, + "legendFormat": "RTT", + "refId": "A" + }, + { + "expr": "1-avg_over_time(probe_success{instance=~\"$instance\"}[$__interval])", + "format": "time_series", + "legendFormat": "packet loss", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "ICMP round trip time ($instance)", + "tooltip": { + "shared": true, + "sort": 1, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "dtdurations", + "label": "RTT", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "percentunit", + "label": "packet loss", + "logBase": 1, + "max": "1", + "min": "0.0001", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": false, + "schemaVersion": 21, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "default", + "value": "default" + }, + "hide": 0, + "includeAll": false, + "label": "datasource", + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "definition": "label_values(probe_success, instance)", + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "instance", + "options": [], + "query": "label_values(probe_success, instance)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-4h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "ICMP exporter", + "version": 1, + "description": "Graph ICMP metrics from the blackbox exporter, Smokeping-style" +} \ No newline at end of file diff --git a/terraform-ci-infra/1n_nmd/grafana/conf/consul.json b/terraform-ci-infra/1n_nmd/grafana/conf/consul.json new file mode 100644 index 0000000000..2e4a36f076 --- /dev/null +++ b/terraform-ci-infra/1n_nmd/grafana/conf/consul.json @@ -0,0 +1,1438 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.3.0-beta1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": 2351, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "height": 153, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 1, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "consul_raft_leader_lastcontact_count", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{host}}", + "refId": "A", + "step": 60 + } + ], + "thresholds": "", + "title": "Consul Leader", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "name" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "none", + "gauge": { + "maxValue": 3, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 17, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "COUNT (changes(consul_memberlist_gossep_sum[1m]) > 0) BY (labels)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 60 + } + ], + "thresholds": "1,2", + "title": "# servers in cluster", + "type": "singlestat", + "valueFontSize": "100%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 18, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(irate(node_cpu{mode=\"idle\", host=\"$consul\"}[1m])) * 100 / count_scalar(node_cpu{mode=\"user\", host=\"$consul\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 60 + } + ], + "thresholds": "", + "title": "CPU Idle", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "none", + "gauge": { + "maxValue": 4, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 14, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "node_load1{host=\"$consul\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 60 + } + ], + "thresholds": "1,2", + "title": "Load 1", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "none", + "gauge": { + "maxValue": 4, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 15, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "node_load5{host=\"$consul\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 60 + } + ], + "thresholds": "1,2", + "title": "Load 5", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "none", + "gauge": { + "maxValue": 4, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 16, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "node_load15{host=\"$consul\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 60 + } + ], + "thresholds": "1,2", + "title": "Load 15", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "The amount of TCP messages that are sent/received from the server.", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(consul_memberlist_tcp{host=\"$consul\"}[1m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{type}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memberlist TCP Messages", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "The amount of UDP messages that are sent/received from the server.", + "fill": 1, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(consul_memberlist_udp{host=\"$consul\"}[1m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{type}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memberlist UDP Messages", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "This measures the time it takes to replicate log entries to followers. This is a general indicator of the load pressure on the Consul servers, as well as the performance of the communication between the servers.", + "fill": 1, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "consul_raft_replication_appendEntries_rpc", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{query}} - {{quantile}}%", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Log replication from leader to servers", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "consul_raft_replication_heartbeat", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{query}} - {{quantile}}%", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "consul_raft_replication_heartbeat", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "This measures the time it takes for the leader to write log entries to disk.", + "fill": 1, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "consul_raft_leader_dispatchLog", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{quantile}}%", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Write logs", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "This measures the time it takes to commit a new entry to the Raft log on the leader.", + "fill": 1, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "consul_raft_commitTime", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{quantile}}%", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Commit time Leader", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "This counts the number of Raft transactions occurring over the interval, which is a general indicator of the write load on the Consul servers.", + "fill": 1, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "delta(consul_raft_apply[30s])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Transactions", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Raft Transactions", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "This will only be emitted by the Raft leader and measures the time since the leader was last able to contact the follower nodes when checking its leader lease. It can be used as a measure for how stable the Raft timing is and how close the leader is to timing out its lease.\n\nThe lease timeout is 500 ms times the raft_multiplier configuration, so this telemetry value should not be getting close to that configured value, otherwise the Raft timing is marginal and might need to be tuned, or more powerful servers might be needed. See the Server Performance guide for more details.", + "fill": 1, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "consul_raft_leader_lastcontact", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{quantile}}%", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Leader lastContact", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "delta(consul_rpc_query{host=\"$consul\"}[30s])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Requests", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "RPC Requests", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "Consul uses a network tomography system to compute network coordinates for nodes in the cluster. These coordinates allow the network round trip time to be estimated between any two nodes using a very simple calculation. This allows for many useful applications, such as finding the service node nearest a requesting node, or failing over to services in the next closest datacenter.", + "fill": 1, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "consul_serf_coordinate_adjustment_ms{host=\"$consul\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{quantile}}%", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Serf Coordinates", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "default", + "value": "default" + }, + "hide": 0, + "includeAll": false, + "label": "datasource", + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "consul", + "options": [], + "query": "label_values(consul_memberlist_gossep_sum, host)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-4h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Consul", + "version": 1 +} \ No newline at end of file diff --git a/terraform-ci-infra/1n_nmd/grafana/conf/docker_cadvisor.json b/terraform-ci-infra/1n_nmd/grafana/conf/docker_cadvisor.json new file mode 100644 index 0000000000..bbad614bb4 --- /dev/null +++ b/terraform-ci-infra/1n_nmd/grafana/conf/docker_cadvisor.json @@ -0,0 +1,2040 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "6.2.4" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "A simple overview of the most important Docker host and container metrics. (cAdvisor/Prometheus)", + "editable": true, + "gnetId": 10657, + "graphTooltip": 1, + "id": null, + "iteration": 1564715574785, + "links": [], + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 0, + "editable": true, + "error": false, + "format": "s", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 0 + }, + "height": "", + "id": 24, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "options": {}, + "postfix": "", + "postfixFontSize": "30%", + "prefix": "", + "prefixFontSize": "20%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "time() - node_boot_time_seconds{instance=~\"$node:.*\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 1800 + } + ], + "thresholds": "", + "title": "Uptime", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 0 + }, + "id": 31, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "options": {}, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(container_last_seen{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"})", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 1800 + } + ], + "thresholds": "", + "title": "Containers", + "type": "singlestat", + "valueFontSize": "120%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 0, + "editable": true, + "error": false, + "format": "decbytes", + "gauge": { + "maxValue": 500000000, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 0 + }, + "id": 30, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "options": {}, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(node_memory_SwapTotal_bytes{instance=~'$node:9100'} - node_memory_SwapFree_bytes{instance=~'$node:9100'})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 1800 + } + ], + "thresholds": "400000000", + "title": "Swap", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 0, + "editable": true, + "error": false, + "format": "percentunit", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 0 + }, + "id": 27, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "options": {}, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(50, 189, 31, 0.18)", + "full": false, + "lineColor": "rgb(69, 193, 31)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "node_load1{instance=~\"$node:9100\"} / count by(job, instance)(count by(job, instance, cpu)(node_cpu_seconds_total{instance=~\"$node:9100\"}))", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 1800 + } + ], + "thresholds": "0.8,0.9", + "title": "Load", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "alert": { + "conditions": [ + { + "evaluator": { + "params": [ + 10000000000 + ], + "type": "gt" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "frequency": "60s", + "handler": 1, + "name": "Available Memory alert", + "noDataState": "keep_state", + "notifications": [ + { + "id": 1 + } + ] + }, + "aliasColors": { + "Available Memory": "#7EB26D", + "Unavailable Memory": "#7EB26D" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 10, + "w": 4, + "x": 16, + "y": 0 + }, + "id": 20, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": {}, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "container_memory_rss{name=~\".+\"}", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "{{__name__}}", + "refId": "D", + "step": 20 + }, + { + "expr": "sum(container_memory_rss{name=~\".+\"})", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "{{__name__}}", + "refId": "A", + "step": 20 + }, + { + "expr": "container_memory_usage_bytes{name=~\".+\"}", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "{{name}}", + "refId": "B", + "step": 20 + }, + { + "expr": "container_memory_rss{id=\"/\"}", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "{{__name__}}", + "refId": "C", + "step": 20 + }, + { + "expr": "sum(container_memory_rss)", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "{{__name__}}", + "refId": "E", + "step": 20 + }, + { + "expr": "node_memory_Buffers", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "node_memory_Dirty", + "refId": "N", + "step": 30 + }, + { + "expr": "node_memory_MemFree", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "{{__name__}}", + "refId": "F", + "step": 20 + }, + { + "expr": "node_memory_MemAvailable", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "Available Memory", + "refId": "H", + "step": 20 + }, + { + "expr": "node_memory_MemTotal_bytes{instance=~\"$node:9100\"} - node_memory_MemAvailable_bytes{instance=~\"$node:9100\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Unavailable Memory", + "refId": "G", + "step": 600 + }, + { + "expr": "node_memory_Inactive", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "{{__name__}}", + "refId": "I", + "step": 30 + }, + { + "expr": "node_memory_KernelStack", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "{{__name__}}", + "refId": "J", + "step": 30 + }, + { + "expr": "node_memory_Active", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "{{__name__}}", + "refId": "K", + "step": 30 + }, + { + "expr": "node_memory_MemTotal - (node_memory_Active + node_memory_MemFree + node_memory_Inactive)", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "Unknown", + "refId": "L", + "step": 40 + }, + { + "expr": "node_memory_MemFree + node_memory_Inactive ", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "{{__name__}}", + "refId": "M", + "step": 30 + }, + { + "expr": "container_memory_rss{name=~\".+\"}", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "{{__name__}}", + "refId": "O", + "step": 30 + }, + { + "expr": "node_memory_Inactive + node_memory_MemFree + node_memory_MemAvailable", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "P", + "step": 40 + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 10000000000, + "yaxis": "left" + } + ], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Available Memory", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": false, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "", + "logBase": 1, + "max": 16000000000, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "alert": { + "conditions": [ + { + "evaluator": { + "params": [ + 850000000000 + ], + "type": "gt" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "frequency": "60s", + "handler": 1, + "name": "Free/Used Disk Space alert", + "noDataState": "keep_state", + "notifications": [ + { + "id": 1 + } + ] + }, + "aliasColors": { + "Belegete Festplatte": "#BF1B00", + "Free Disk Space": "#7EB26D", + "Used Disk Space": "#7EB26D", + "{}": "#BF1B00" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 10, + "w": 4, + "x": 20, + "y": 0 + }, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": {}, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Used Disk Space", + "yaxis": 1 + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_filesystem_size_bytes{fstype=\"rootfs\"} - node_filesystem_free_bytes{fstype=\"rootfs\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Used Disk Space", + "refId": "A", + "step": 600 + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 850000000000 + } + ], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Used Disk Space", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": false, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "", + "logBase": 1, + "max": 1000000000000, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "SENT": "#BF1B00" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 6, + "w": 4, + "x": 0, + "y": 4 + }, + "id": 19, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": {}, + "percentage": false, + "pointradius": 1, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(container_network_receive_bytes_total{id=\"/\"}[$interval])) by (id)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "RECEIVED", + "refId": "A", + "step": 600 + }, + { + "expr": "- sum(rate(container_network_transmit_bytes_total{id=\"/\"}[$interval])) by (id)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "SENT", + "refId": "B", + "step": 600 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": false, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 0, + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 4, + "y": 4 + }, + "id": 25, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "options": {}, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "((node_memory_MemTotal_bytes{instance=~\"$node:9100\"} - node_memory_MemAvailable_bytes{instance=~\"$node:9100\"}) / node_memory_MemTotal_bytes{instance=~\"$node:9100\"}) * 100", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 1800 + } + ], + "thresholds": "70, 90", + "title": "Memory", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": { + "{id=\"/\",instance=\"cadvisor:8080\",job=\"prometheus\"}": "#BA43A9" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 6, + "w": 4, + "x": 8, + "y": 4 + }, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": {}, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(container_cpu_system_seconds_total[1m]))", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "a", + "refId": "B", + "step": 120 + }, + { + "expr": "sum(rate(container_cpu_system_seconds_total{name=~\".+\"}[1m]))", + "format": "time_series", + "hide": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "nur container", + "refId": "F", + "step": 10 + }, + { + "expr": "sum(rate(container_cpu_system_seconds_total{id=\"/\"}[1m]))", + "format": "time_series", + "hide": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "nur docker host", + "metric": "", + "refId": "A", + "step": 20 + }, + { + "expr": "sum(rate(process_cpu_seconds_total[$interval])) * 100", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "host", + "metric": "", + "refId": "C", + "step": 600 + }, + { + "expr": "sum(rate(container_cpu_system_seconds_total{name=~\".+\"}[1m])) + sum(rate(container_cpu_system_seconds_total{id=\"/\"}[1m])) + sum(rate(process_cpu_seconds_total[1m]))", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 120 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": false, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "IN on /sda": "#7EB26D", + "OUT on /sda": "#890F02" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 6, + "w": 4, + "x": 12, + "y": 4 + }, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": {}, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "-sum(rate(node_disk_read_bytes_total[$interval])) by (device)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "OUT on /{{device}}", + "metric": "node_disk_bytes_read", + "refId": "A", + "step": 600 + }, + { + "expr": "sum(rate(node_disk_written_bytes_total[$interval])) by (device)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "IN on /{{device}}", + "metric": "", + "refId": "B", + "step": 600 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk I/O", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": false, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 10 + }, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null as zero", + "options": {}, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(container_network_receive_bytes_total{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}[$interval])) by (name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{name}}", + "refId": "A", + "step": 240 + }, + { + "expr": "- rate(container_network_transmit_bytes_total{name=~\".+\"}[$interval])", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "{{name}}", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Received Network Traffic per Container", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 10 + }, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null as zero", + "options": {}, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}[$interval])) by (name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{name}}", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(container_network_transmit_bytes_total{id=\"/\"}[$interval])", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Sent Network Traffic per Container", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": "", + "logBase": 10, + "max": 8, + "min": 0, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 5, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 1, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": {}, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(container_cpu_usage_seconds_total{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}[$interval])) by (name) * 100", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{name}}", + "metric": "", + "refId": "F", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU Usage per Container", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "", + "logBase": 1, + "max": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 3, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 17 + }, + "id": 34, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null as zero", + "options": {}, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(container_memory_swap{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}) by (name)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{name}}", + "refId": "A", + "step": 240 + }, + { + "expr": "container_memory_usage_bytes{name=~\".+\"}", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "{{name}}", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Swap per Container", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 3, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null as zero", + "options": {}, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(container_memory_rss{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}) by (name)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{name}}", + "refId": "A", + "step": 240 + }, + { + "expr": "container_memory_usage_bytes{name=~\".+\"}", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "{{name}}", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Usage per Container", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "columns": [ + { + "text": "Current", + "value": "current" + } + ], + "editable": true, + "error": false, + "fontSize": "100%", + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 36, + "links": [], + "options": {}, + "pageSize": null, + "scroll": true, + "showHeader": true, + "sort": { + "col": 0, + "desc": true + }, + "styles": [ + { + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + "10000000", + " 25000000" + ], + "type": "number", + "unit": "decbytes" + } + ], + "targets": [ + { + "expr": "sum(container_spec_memory_limit_bytes{name=~\".+\"} - container_memory_usage_bytes{name=~\".+\"}) by (name) ", + "format": "table", + "hide": true, + "intervalFactor": 2, + "legendFormat": "{{name}}", + "metric": "", + "refId": "A", + "step": 240 + }, + { + "expr": "sum(container_spec_memory_limit_bytes{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}) by (name) ", + "format": "table", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{name}}", + "refId": "B", + "step": 240 + }, + { + "expr": "container_memory_usage_bytes{name=~\".+\"}", + "format": "table", + "hide": true, + "intervalFactor": 2, + "legendFormat": "{{name}}", + "refId": "C", + "step": 240 + } + ], + "title": "Limit memory", + "transform": "table", + "type": "table" + } + ], + "refresh": "5m", + "schemaVersion": 18, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "default", + "value": "default" + }, + "hide": 0, + "includeAll": false, + "label": "datasource", + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "definition": "label_values(container_cpu_user_seconds_total, job)", + "hide": 0, + "includeAll": false, + "label": "Job", + "multi": false, + "name": "job", + "options": [], + "query": "label_values(container_cpu_user_seconds_total, job)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "definition": "label_values(container_cpu_user_seconds_total{job=~\"$job\"}, instance)", + "hide": 0, + "includeAll": false, + "label": "Host:", + "multi": false, + "name": "node", + "options": [], + "query": "label_values(container_cpu_user_seconds_total{job=~\"$job\"}, instance)", + "refresh": 1, + "regex": "/([^:]+):.*/", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": null, + "tags": [], + "tagsQuery": null, + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "definition": "label_values(container_cpu_user_seconds_total{instance=~\"$node:(.*)\"}, instance)", + "hide": 0, + "includeAll": false, + "label": "Port", + "multi": false, + "name": "port", + "options": [], + "query": "label_values(container_cpu_user_seconds_total{instance=~\"$node:(.*)\"}, instance)", + "refresh": 1, + "regex": "/[^:]+:(.*)/", + "skipUrlSync": false, + "sort": 3, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "auto": true, + "auto_count": 30, + "auto_min": "50s", + "current": { + "text": "1m", + "value": "1m" + }, + "hide": 0, + "label": "Interval", + "name": "interval", + "options": [ + { + "selected": false, + "text": "auto", + "value": "$__auto_interval_interval" + }, + { + "selected": true, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + }, + { + "selected": false, + "text": "1d", + "value": "1d" + }, + { + "selected": false, + "text": "7d", + "value": "7d" + }, + { + "selected": false, + "text": "14d", + "value": "14d" + }, + { + "selected": false, + "text": "30d", + "value": "30d" + } + ], + "query": "1m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d", + "refresh": 2, + "skipUrlSync": false, + "type": "interval" + } + ] + }, + "time": { + "from": "now-4h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Docker cAdvisor", + "version": 1 +} diff --git a/terraform-ci-infra/1n_nmd/grafana/conf/node_exporter.json b/terraform-ci-infra/1n_nmd/grafana/conf/node_exporter.json new file mode 100644 index 0000000000..766d5afec3 --- /dev/null +++ b/terraform-ci-infra/1n_nmd/grafana/conf/node_exporter.json @@ -0,0 +1,13696 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "6.7.3" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [ + { + "$$hashKey": "object:1058", + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": 1860, + "graphTooltip": 0, + "id": null, + "iteration": 1595837627257, + "links": [], + "panels": [ + { + "collapsed": false, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 261, + "panels": [], + "repeat": null, + "title": "Quick CPU / Mem / Disk", + "type": "row" + }, + { + "cacheTimeout": null, + "datasource": "${DS_PROMETHEUS}", + "description": "Busy state of all CPU cores together", + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 1 + }, + "id": 20, + "links": [], + "options": { + "fieldOptions": { + "calcs": [ + "lastNotNull" + ], + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "max": 100, + "min": 0, + "nullValueMode": "null", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 85 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [], + "values": false + }, + "orientation": "horizontal", + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "6.7.3", + "targets": [ + { + "expr": "(((count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))) - avg(sum by (mode)(irate(node_cpu_seconds_total{mode='idle',instance=\"$node\",job=\"$job\"}[5m])))) * 100) / count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))", + "hide": false, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 900 + } + ], + "title": "CPU Busy", + "type": "gauge" + }, + { + "cacheTimeout": null, + "datasource": "${DS_PROMETHEUS}", + "description": "Busy state of all CPU cores together (5 min average)", + "gridPos": { + "h": 4, + "w": 3, + "x": 3, + "y": 1 + }, + "id": 155, + "links": [], + "options": { + "fieldOptions": { + "calcs": [ + "lastNotNull" + ], + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "max": 100, + "min": 0, + "nullValueMode": "null", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 85 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [], + "values": false + }, + "orientation": "horizontal", + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "6.7.3", + "targets": [ + { + "expr": "avg(node_load5{instance=\"$node\",job=\"$job\"}) / count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)) * 100", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "refId": "A", + "step": 900 + } + ], + "title": "Sys Load (5m avg)", + "type": "gauge" + }, + { + "cacheTimeout": null, + "datasource": "${DS_PROMETHEUS}", + "description": "Busy state of all CPU cores together (15 min average)", + "gridPos": { + "h": 4, + "w": 3, + "x": 6, + "y": 1 + }, + "id": 19, + "links": [], + "options": { + "fieldOptions": { + "calcs": [ + "lastNotNull" + ], + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "max": 100, + "min": 0, + "nullValueMode": "null", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 85 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [], + "values": false + }, + "orientation": "horizontal", + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "6.7.3", + "targets": [ + { + "expr": "avg(node_load15{instance=\"$node\",job=\"$job\"}) / count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)) * 100", + "hide": false, + "intervalFactor": 1, + "refId": "A", + "step": 900 + } + ], + "title": "Sys Load (15m avg)", + "type": "gauge" + }, + { + "cacheTimeout": null, + "datasource": "${DS_PROMETHEUS}", + "description": "Non available RAM memory", + "gridPos": { + "h": 4, + "w": 3, + "x": 9, + "y": 1 + }, + "hideTimeOverride": false, + "id": 16, + "links": [], + "options": { + "fieldOptions": { + "calcs": [ + "lastNotNull" + ], + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [], + "max": 100, + "min": 0, + "nullValueMode": "null", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 80 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [], + "values": false + }, + "orientation": "horizontal", + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "6.7.3", + "targets": [ + { + "expr": "((node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}) / (node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} )) * 100", + "format": "time_series", + "hide": true, + "intervalFactor": 1, + "refId": "A", + "step": 900 + }, + { + "expr": "100 - ((node_memory_MemAvailable_bytes{instance=\"$node\",job=\"$job\"} * 100) / node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "refId": "B", + "step": 900 + } + ], + "title": "RAM Used", + "type": "gauge" + }, + { + "cacheTimeout": null, + "datasource": "${DS_PROMETHEUS}", + "description": "Used Swap", + "gridPos": { + "h": 4, + "w": 3, + "x": 12, + "y": 1 + }, + "id": 21, + "links": [], + "options": { + "fieldOptions": { + "calcs": [ + "lastNotNull" + ], + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "max": 100, + "min": 0, + "nullValueMode": "null", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 10 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 25 + } + ] + }, + "unit": "percent" + }, + "overrides": [], + "values": false + }, + "orientation": "horizontal", + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "6.7.3", + "targets": [ + { + "expr": "((node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"}) / (node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} )) * 100", + "intervalFactor": 1, + "refId": "A", + "step": 900 + } + ], + "title": "SWAP Used", + "type": "gauge" + }, + { + "cacheTimeout": null, + "datasource": "${DS_PROMETHEUS}", + "description": "Used Root FS", + "gridPos": { + "h": 4, + "w": 3, + "x": 15, + "y": 1 + }, + "id": 154, + "links": [], + "options": { + "fieldOptions": { + "calcs": [ + "lastNotNull" + ], + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "max": 100, + "min": 0, + "nullValueMode": "null", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 80 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [], + "values": false + }, + "orientation": "horizontal", + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "6.7.3", + "targets": [ + { + "expr": "100 - ((node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"} * 100) / node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"})", + "format": "time_series", + "intervalFactor": 1, + "refId": "A", + "step": 900 + } + ], + "title": "Root FS Used", + "type": "gauge" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "description": "Total number of CPU cores", + "format": "short", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 18, + "y": 1 + }, + "id": 14, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "maxPerRow": 6, + "nullPointMode": "null", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))", + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 900 + } + ], + "thresholds": "", + "title": "CPU Cores", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 1, + "description": "System uptime", + "format": "s", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 20, + "y": 1 + }, + "hideTimeOverride": true, + "id": 15, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "$$hashKey": "object:1094", + "name": "value to text", + "value": 1 + }, + { + "$$hashKey": "object:1095", + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "null", + "nullText": null, + "postfix": "s", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "node_time_seconds{instance=\"$node\",job=\"$job\"} - node_boot_time_seconds{instance=\"$node\",job=\"$job\"}", + "intervalFactor": 2, + "refId": "A", + "step": 1800 + } + ], + "thresholds": "", + "title": "Uptime", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "$$hashKey": "object:1097", + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 0, + "description": "Total RootFS", + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 18, + "y": 3 + }, + "id": 23, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "maxPerRow": 6, + "nullPointMode": "null", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "refId": "A", + "step": 900 + } + ], + "thresholds": "70,90", + "title": "RootFS Total", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 0, + "description": "Total RAM", + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 20, + "y": 3 + }, + "id": 75, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "maxPerRow": 6, + "nullPointMode": "null", + "nullText": null, + "postfix": "", + "postfixFontSize": "70%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"}", + "intervalFactor": 1, + "refId": "A", + "step": 900 + } + ], + "thresholds": "", + "title": "RAM Total", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 0, + "description": "Total SWAP", + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 22, + "y": 3 + }, + "id": 18, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "maxPerRow": 6, + "nullPointMode": "null", + "nullText": null, + "postfix": "", + "postfixFontSize": "70%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"}", + "intervalFactor": 1, + "refId": "A", + "step": 900 + } + ], + "thresholds": "", + "title": "SWAP Total", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "collapsed": false, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + }, + "id": 263, + "panels": [], + "repeat": null, + "title": "Basic CPU / Mem / Net / Disk", + "type": "row" + }, + { + "aliasColors": { + "Busy": "#EAB839", + "Busy Iowait": "#890F02", + "Busy other": "#1F78C1", + "Idle": "#052B51", + "Idle - Waiting for something to happen": "#052B51", + "guest": "#9AC48A", + "idle": "#052B51", + "iowait": "#EAB839", + "irq": "#BF1B00", + "nice": "#C15C17", + "softirq": "#E24D42", + "steal": "#FCE2DE", + "system": "#508642", + "user": "#5195CE" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "description": "Basic CPU info", + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 6 + }, + "hiddenSeries": false, + "id": 77, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": 250, + "sort": null, + "sortDesc": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": true, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Busy Iowait", + "color": "#890F02" + }, + { + "alias": "Idle", + "color": "#7EB26D" + }, + { + "alias": "Busy System", + "color": "#EAB839" + }, + { + "alias": "Busy User", + "color": "#0A437C" + }, + { + "alias": "Busy Other", + "color": "#6D1F62" + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (instance)(irate(node_cpu_seconds_total{mode=\"system\",instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Busy System", + "refId": "A", + "step": 240 + }, + { + "expr": "sum by (instance)(irate(node_cpu_seconds_total{mode='user',instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Busy User", + "refId": "B", + "step": 240 + }, + { + "expr": "sum by (instance)(irate(node_cpu_seconds_total{mode='iowait',instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Busy Iowait", + "refId": "C", + "step": 240 + }, + { + "expr": "sum by (instance)(irate(node_cpu_seconds_total{mode=~\".*irq\",instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Busy IRQs", + "refId": "D", + "step": 240 + }, + { + "expr": "sum (irate(node_cpu_seconds_total{mode!='idle',mode!='user',mode!='system',mode!='iowait',mode!='irq',mode!='softirq',instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Busy Other", + "refId": "E", + "step": 240 + }, + { + "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode='idle',instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Idle", + "refId": "F", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU Basic", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "", + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "SWAP Used": "#BF1B00", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap Used": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "description": "Basic memory usage", + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 6 + }, + "hiddenSeries": false, + "id": 78, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": 350, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "RAM Total", + "color": "#E0F9D7", + "fill": 0, + "stack": false + }, + { + "alias": "RAM Cache + Buffer", + "color": "#052B51" + }, + { + "alias": "RAM Free", + "color": "#7EB26D" + }, + { + "alias": "Avaliable", + "color": "#DEDAF7", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "RAM Total", + "refId": "A", + "step": 240 + }, + { + "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"} - (node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} + node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "RAM Used", + "refId": "B", + "step": 240 + }, + { + "expr": "node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} + node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "RAM Cache + Buffer", + "refId": "C", + "step": 240 + }, + { + "expr": "node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "RAM Free", + "refId": "D", + "step": 240 + }, + { + "expr": "(node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "SWAP Used", + "refId": "E", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Basic", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Recv_bytes_eth2": "#7EB26D", + "Recv_bytes_lo": "#0A50A1", + "Recv_drop_eth2": "#6ED0E0", + "Recv_drop_lo": "#E0F9D7", + "Recv_errs_eth2": "#BF1B00", + "Recv_errs_lo": "#CCA300", + "Trans_bytes_eth2": "#7EB26D", + "Trans_bytes_lo": "#0A50A1", + "Trans_drop_eth2": "#6ED0E0", + "Trans_drop_lo": "#E0F9D7", + "Trans_errs_eth2": "#BF1B00", + "Trans_errs_lo": "#CCA300", + "recv_bytes_lo": "#0A50A1", + "recv_drop_eth0": "#99440A", + "recv_drop_lo": "#967302", + "recv_errs_eth0": "#BF1B00", + "recv_errs_lo": "#890F02", + "trans_bytes_eth0": "#7EB26D", + "trans_bytes_lo": "#0A50A1", + "trans_drop_eth0": "#99440A", + "trans_drop_lo": "#967302", + "trans_errs_eth0": "#BF1B00", + "trans_errs_lo": "#890F02" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "Basic network info per interface", + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 13 + }, + "hiddenSeries": false, + "id": 74, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\"}[5m])*8", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "recv {{device}}", + "refId": "A", + "step": 240 + }, + { + "expr": "irate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\"}[5m])*8", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "trans {{device}} ", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic Basic", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "pps", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 3, + "description": "Disk space used of all filesystems mounted", + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 13 + }, + "height": "", + "hiddenSeries": false, + "id": 152, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "100 - ((node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'} * 100) / node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{mountpoint}}", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk Space Used Basic", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 265, + "panels": [ + { + "aliasColors": { + "Idle - Waiting for something to happen": "#052B51", + "guest": "#9AC48A", + "idle": "#052B51", + "iowait": "#EAB839", + "irq": "#BF1B00", + "nice": "#C15C17", + "softirq": "#E24D42", + "steal": "#FCE2DE", + "system": "#508642", + "user": "#5195CE" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "description": "", + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 21 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 250, + "sort": null, + "sortDesc": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": true, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode=\"system\",instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "interval": "10s", + "intervalFactor": 2, + "legendFormat": "System - Processes executing in kernel mode", + "refId": "A", + "step": 20 + }, + { + "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode='user',instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "User - Normal processes executing in user mode", + "refId": "B", + "step": 240 + }, + { + "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode='nice',instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Nice - Niced processes executing in user mode", + "refId": "C", + "step": 240 + }, + { + "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode='idle',instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Idle - Waiting for something to happen", + "refId": "D", + "step": 240 + }, + { + "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode='iowait',instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Iowait - Waiting for I/O to complete", + "refId": "E", + "step": 240 + }, + { + "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode='irq',instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Irq - Servicing interrupts", + "refId": "F", + "step": 240 + }, + { + "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode='softirq',instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Softirq - Servicing softirqs", + "refId": "G", + "step": 240 + }, + { + "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode='steal',instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Steal - Time spent in other operating systems when running in a virtualized environment", + "refId": "H", + "step": 240 + }, + { + "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode='guest',instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Guest - Time spent running a virtual CPU for a guest operating system", + "refId": "I", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "percentage", + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap - Swap memory usage": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839", + "Unused - Free memory unassigned": "#052B51" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "description": "", + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 21 + }, + "hiddenSeries": false, + "id": 24, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 350, + "sort": null, + "sortDesc": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Hardware Corrupted - *./", + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Slab_bytes{instance=\"$node\",job=\"$job\"} - node_memory_PageTables_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapCached_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Apps - Memory used by user-space applications", + "refId": "A", + "step": 240 + }, + { + "expr": "node_memory_PageTables_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "PageTables - Memory used to map between virtual and physical memory addresses", + "refId": "B", + "step": 240 + }, + { + "expr": "node_memory_SwapCached_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "SwapCache - Memory that keeps track of pages that have been fetched from swap but not yet been modified", + "refId": "C", + "step": 240 + }, + { + "expr": "node_memory_Slab_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Slab - Memory used by the kernel to cache data structures for its own use (caches like inode, dentry, etc)", + "refId": "D", + "step": 240 + }, + { + "expr": "node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Cache - Parked file data (file content) cache", + "refId": "E", + "step": 240 + }, + { + "expr": "node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Buffers - Block device (e.g. harddisk) cache", + "refId": "F", + "step": 240 + }, + { + "expr": "node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Unused - Free memory unassigned", + "refId": "G", + "step": 240 + }, + { + "expr": "(node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Swap - Swap space used", + "refId": "H", + "step": 240 + }, + { + "expr": "node_memory_HardwareCorrupted_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working", + "refId": "I", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Stack", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "receive_packets_eth0": "#7EB26D", + "receive_packets_lo": "#E24D42", + "transmit_packets_eth0": "#7EB26D", + "transmit_packets_lo": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 33 + }, + "hiddenSeries": false, + "id": 84, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:5871", + "alias": "/.*Trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\"}[5m])*8", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Receive", + "refId": "A", + "step": 240 + }, + { + "expr": "irate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\"}[5m])*8", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Transmit", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:5884", + "format": "bps", + "label": "bits out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:5885", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 3, + "description": "", + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 33 + }, + "height": "", + "hiddenSeries": false, + "id": 156, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'} - node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{mountpoint}}", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk Space Used", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 45 + }, + "hiddenSeries": false, + "id": 229, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Read.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*sda_.*/", + "color": "#7EB26D" + }, + { + "alias": "/.*sdb_.*/", + "color": "#EAB839" + }, + { + "alias": "/.*sdc_.*/", + "color": "#6ED0E0" + }, + { + "alias": "/.*sdd_.*/", + "color": "#EF843C" + }, + { + "alias": "/.*sde_.*/", + "color": "#E24D42" + }, + { + "alias": "/.*sda1.*/", + "color": "#584477" + }, + { + "alias": "/.*sda2_.*/", + "color": "#BA43A9" + }, + { + "alias": "/.*sda3_.*/", + "color": "#F4D598" + }, + { + "alias": "/.*sdb1.*/", + "color": "#0A50A1" + }, + { + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "alias": "/.*sdb3.*/", + "color": "#E0752D" + }, + { + "alias": "/.*sdc1.*/", + "color": "#962D82" + }, + { + "alias": "/.*sdc2.*/", + "color": "#614D93" + }, + { + "alias": "/.*sdc3.*/", + "color": "#9AC48A" + }, + { + "alias": "/.*sdd1.*/", + "color": "#65C5DB" + }, + { + "alias": "/.*sdd2.*/", + "color": "#F9934E" + }, + { + "alias": "/.*sdd3.*/", + "color": "#EA6460" + }, + { + "alias": "/.*sde1.*/", + "color": "#E0F9D7" + }, + { + "alias": "/.*sdd2.*/", + "color": "#FCEACA" + }, + { + "alias": "/.*sde3.*/", + "color": "#F9E2D2" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[5m])", + "intervalFactor": 4, + "legendFormat": "{{device}} - Reads completed", + "refId": "A", + "step": 480 + }, + { + "expr": "irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[5m])", + "intervalFactor": 2, + "legendFormat": "{{device}} - Writes completed", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk IOps", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "iops", + "label": "IO read (-) / write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "io time": "#890F02" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 3, + "description": "", + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 45 + }, + "hiddenSeries": false, + "id": 42, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": null, + "sortDesc": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*read*./", + "transform": "negative-Y" + }, + { + "alias": "/.*sda.*/", + "color": "#7EB26D" + }, + { + "alias": "/.*sdb.*/", + "color": "#EAB839" + }, + { + "alias": "/.*sdc.*/", + "color": "#6ED0E0" + }, + { + "alias": "/.*sdd.*/", + "color": "#EF843C" + }, + { + "alias": "/.*sde.*/", + "color": "#E24D42" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_read_bytes_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[5m])", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}} - Successfully read bytes", + "refId": "A", + "step": 240 + }, + { + "expr": "irate(node_disk_written_bytes_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[5m])", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}} - Successfully written bytes", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "I/O Usage Read / Write", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": false, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes read (-) / write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "io time": "#890F02" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 3, + "description": "", + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 57 + }, + "hiddenSeries": false, + "id": 127, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": null, + "sortDesc": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_io_time_seconds_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"} [5m])", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}} - Time spent doing I/Os", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "I/O Usage Times", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": false, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "time", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "s", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "CPU / Memory / Net / Disk", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 21 + }, + "id": 266, + "panels": [ + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 70 + }, + "hiddenSeries": false, + "id": 136, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 350, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 2, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_Inactive_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Inactive - Memory which has been less recently used. It is more eligible to be reclaimed for other purposes", + "refId": "A", + "step": 4 + }, + { + "expr": "node_memory_Active_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Active - Memory that has been used more recently and usually not reclaimed unless absolutely necessary", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Active / Inactive", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 70 + }, + "hiddenSeries": false, + "id": 135, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 350, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Committed_AS - *./" + }, + { + "alias": "/.*CommitLimit - *./", + "color": "#BF1B00", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_Committed_AS_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Committed_AS - Amount of memory presently allocated on the system", + "refId": "A", + "step": 4 + }, + { + "expr": "node_memory_CommitLimit_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "CommitLimit - Amount of memory currently available to be allocated on the system", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Commited", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 80 + }, + "hiddenSeries": false, + "id": 191, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 350, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_Inactive_file_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Inactive_file - File-backed memory on inactive LRU list", + "refId": "A", + "step": 4 + }, + { + "expr": "node_memory_Inactive_anon_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Inactive_anon - Anonymous and swap cache on inactive LRU list, including tmpfs (shmem)", + "refId": "B", + "step": 4 + }, + { + "expr": "node_memory_Active_file_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Active_file - File-backed memory on active LRU list", + "refId": "C", + "step": 4 + }, + { + "expr": "node_memory_Active_anon_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Active_anon - Anonymous and swap cache on active least-recently-used (LRU) list, including tmpfs", + "refId": "D", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Active / Inactive Detail", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active": "#99440A", + "Buffers": "#58140C", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Dirty": "#6ED0E0", + "Free": "#B7DBAB", + "Inactive": "#EA6460", + "Mapped": "#052B51", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "Slab_Cache": "#EAB839", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Total": "#511749", + "Total RAM": "#052B51", + "Total RAM + Swap": "#052B51", + "Total Swap": "#614D93", + "VmallocUsed": "#EA6460" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 80 + }, + "hiddenSeries": false, + "id": 130, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 2, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_Writeback_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Writeback - Memory which is actively being written back to disk", + "refId": "A", + "step": 4 + }, + { + "expr": "node_memory_WritebackTmp_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "WritebackTmp - Memory used by FUSE for temporary writeback buffers", + "refId": "B", + "step": 4 + }, + { + "expr": "node_memory_Dirty_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Dirty - Memory which is waiting to get written back to the disk", + "refId": "C", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Writeback and Dirty", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 90 + }, + "hiddenSeries": false, + "id": 138, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 350, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:4131", + "alias": "ShmemHugePages - Memory used by shared memory (shmem) and tmpfs allocated with huge pages", + "fill": 0 + }, + { + "$$hashKey": "object:4138", + "alias": "ShmemHugePages - Memory used by shared memory (shmem) and tmpfs allocated with huge pages", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_Mapped_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Mapped - Used memory in mapped pages files which have been mmaped, such as libraries", + "refId": "A", + "step": 4 + }, + { + "expr": "node_memory_Shmem_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Shmem - Used shared memory (shared between several processes, thus including RAM disks)", + "refId": "B", + "step": 4 + }, + { + "expr": "node_memory_ShmemHugePages_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "ShmemHugePages - Memory used by shared memory (shmem) and tmpfs allocated with huge pages", + "refId": "C", + "step": 4 + }, + { + "expr": "node_memory_ShmemPmdMapped_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "ShmemPmdMapped - Ammount of shared (shmem/tmpfs) memory backed by huge pages", + "refId": "D", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Shared and Mapped", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:4106", + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:4107", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active": "#99440A", + "Buffers": "#58140C", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Dirty": "#6ED0E0", + "Free": "#B7DBAB", + "Inactive": "#EA6460", + "Mapped": "#052B51", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "Slab_Cache": "#EAB839", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Total": "#511749", + "Total RAM": "#052B51", + "Total RAM + Swap": "#052B51", + "Total Swap": "#614D93", + "VmallocUsed": "#EA6460" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 90 + }, + "hiddenSeries": false, + "id": 131, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 2, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_SUnreclaim_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "SUnreclaim - Part of Slab, that cannot be reclaimed on memory pressure", + "refId": "A", + "step": 4 + }, + { + "expr": "node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "SReclaimable - Part of Slab, that might be reclaimed, such as caches", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Slab", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active": "#99440A", + "Buffers": "#58140C", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Dirty": "#6ED0E0", + "Free": "#B7DBAB", + "Inactive": "#EA6460", + "Mapped": "#052B51", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "Slab_Cache": "#EAB839", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Total": "#511749", + "Total RAM": "#052B51", + "Total RAM + Swap": "#052B51", + "VmallocUsed": "#EA6460" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 100 + }, + "hiddenSeries": false, + "id": 70, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_VmallocChunk_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "VmallocChunk - Largest contigious block of vmalloc area which is free", + "refId": "A", + "step": 4 + }, + { + "expr": "node_memory_VmallocTotal_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "VmallocTotal - Total size of vmalloc memory area", + "refId": "B", + "step": 4 + }, + { + "expr": "node_memory_VmallocUsed_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "VmallocUsed - Amount of vmalloc area which is used", + "refId": "C", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Vmalloc", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 100 + }, + "hiddenSeries": false, + "id": 159, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 350, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_Bounce_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Bounce - Memory used for block device bounce buffers", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Bounce", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active": "#99440A", + "Buffers": "#58140C", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Dirty": "#6ED0E0", + "Free": "#B7DBAB", + "Inactive": "#EA6460", + "Mapped": "#052B51", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "Slab_Cache": "#EAB839", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Total": "#511749", + "Total RAM": "#052B51", + "Total RAM + Swap": "#052B51", + "VmallocUsed": "#EA6460" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 110 + }, + "hiddenSeries": false, + "id": 129, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Inactive *./", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_AnonHugePages_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "AnonHugePages - Memory in anonymous huge pages", + "refId": "A", + "step": 4 + }, + { + "expr": "node_memory_AnonPages_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "AnonPages - Memory in user pages not backed by files", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Anonymous", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 110 + }, + "hiddenSeries": false, + "id": 160, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 350, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 2, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_KernelStack_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "KernelStack - Kernel memory stack. This is not reclaimable", + "refId": "A", + "step": 4 + }, + { + "expr": "node_memory_Percpu_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "PerCPU - Per CPU memory allocated dynamically by loadable modules", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Kernel / CPU", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active": "#99440A", + "Buffers": "#58140C", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Dirty": "#6ED0E0", + "Free": "#B7DBAB", + "Inactive": "#EA6460", + "Mapped": "#052B51", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "Slab_Cache": "#EAB839", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Total": "#511749", + "Total RAM": "#806EB7", + "Total RAM + Swap": "#806EB7", + "VmallocUsed": "#EA6460" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 120 + }, + "hiddenSeries": false, + "id": 140, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_HugePages_Free{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "HugePages_Free - Huge pages in the pool that are not yet allocated", + "refId": "A", + "step": 4 + }, + { + "expr": "node_memory_HugePages_Rsvd{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "HugePages_Rsvd - Huge pages for which a commitment to allocate from the pool has been made, but no allocation has yet been made", + "refId": "B", + "step": 4 + }, + { + "expr": "node_memory_HugePages_Surp{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "HugePages_Surp - Huge pages in the pool above the value in /proc/sys/vm/nr_hugepages", + "refId": "C", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory HugePages Counter", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "pages", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active": "#99440A", + "Buffers": "#58140C", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Dirty": "#6ED0E0", + "Free": "#B7DBAB", + "Inactive": "#EA6460", + "Mapped": "#052B51", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "Slab_Cache": "#EAB839", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Total": "#511749", + "Total RAM": "#806EB7", + "Total RAM + Swap": "#806EB7", + "VmallocUsed": "#EA6460" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 120 + }, + "hiddenSeries": false, + "id": 71, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 2, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_HugePages_Total{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "HugePages - Total size of the pool of huge pages", + "refId": "A", + "step": 4 + }, + { + "expr": "node_memory_Hugepagesize_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Hugepagesize - Huge Page size", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory HugePages Size", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active": "#99440A", + "Buffers": "#58140C", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Dirty": "#6ED0E0", + "Free": "#B7DBAB", + "Inactive": "#EA6460", + "Mapped": "#052B51", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "Slab_Cache": "#EAB839", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Total": "#511749", + "Total RAM": "#052B51", + "Total RAM + Swap": "#052B51", + "VmallocUsed": "#EA6460" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 130 + }, + "hiddenSeries": false, + "id": 128, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_DirectMap1G_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "DirectMap1G - Amount of pages mapped as this size", + "refId": "A", + "step": 4 + }, + { + "expr": "node_memory_DirectMap2M_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "DirectMap2M - Amount of pages mapped as this size", + "refId": "B", + "step": 4 + }, + { + "expr": "node_memory_DirectMap4k_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "DirectMap4K - Amount of pages mapped as this size", + "refId": "C", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory DirectMap", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 130 + }, + "hiddenSeries": false, + "id": 137, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 350, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_Unevictable_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Unevictable - Amount of unevictable memory that can't be swapped out for a variety of reasons", + "refId": "A", + "step": 4 + }, + { + "expr": "node_memory_Mlocked_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "MLocked - Size of pages locked to memory using the mlock() system call", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Unevictable and MLocked", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active": "#99440A", + "Buffers": "#58140C", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Dirty": "#6ED0E0", + "Free": "#B7DBAB", + "Inactive": "#EA6460", + "Mapped": "#052B51", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "Slab_Cache": "#EAB839", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Total": "#511749", + "Total RAM": "#052B51", + "Total RAM + Swap": "#052B51", + "Total Swap": "#614D93", + "VmallocUsed": "#EA6460" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 140 + }, + "hiddenSeries": false, + "id": 132, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_NFS_Unstable_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "NFS Unstable - Memory in NFS pages sent to the server, but not yet commited to the storage", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory NFS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "Memory Meminfo", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 267, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 23 + }, + "hiddenSeries": false, + "id": 176, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*out/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_vmstat_pgpgin{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Pagesin - Page in operations", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_vmstat_pgpgout{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Pagesout - Page out operations", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Pages In / Out", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "pages out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 23 + }, + "hiddenSeries": false, + "id": 22, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*out/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_vmstat_pswpin{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Pswpin - Pages swapped in", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_vmstat_pswpout{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Pswpout - Pages swapped out", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Pages Swap In / Out", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "pages out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 33 + }, + "hiddenSeries": false, + "id": 175, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 350, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:6118", + "alias": "Pgfault - Page major and minor fault operations", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_vmstat_pgfault{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Pgfault - Page major and minor fault operations", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_vmstat_pgmajfault{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Pgmajfault - Major page fault operations", + "refId": "B", + "step": 4 + }, + { + "expr": "irate(node_vmstat_pgfault{instance=\"$node\",job=\"$job\"}[5m]) - irate(node_vmstat_pgmajfault{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Pgminfault - Minor page fault operations", + "refId": "C", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Page Faults", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6133", + "format": "short", + "label": "faults", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6134", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active": "#99440A", + "Buffers": "#58140C", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Dirty": "#6ED0E0", + "Free": "#B7DBAB", + "Inactive": "#EA6460", + "Mapped": "#052B51", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "Slab_Cache": "#EAB839", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Total": "#511749", + "Total RAM": "#052B51", + "Total RAM + Swap": "#052B51", + "Total Swap": "#614D93", + "VmallocUsed": "#EA6460" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 33 + }, + "hiddenSeries": false, + "id": 307, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_vmstat_oom_kill{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "oom killer invocations ", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "OOM Killer", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:5373", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:5374", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "Memory Vmstat", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 23 + }, + "id": 293, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 24 + }, + "hiddenSeries": false, + "id": 260, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Variation*./", + "color": "#890F02" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_timex_estimated_error_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Estimated error in seconds", + "refId": "A", + "step": 240 + }, + { + "expr": "node_timex_offset_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Time offset in between local system and reference clock", + "refId": "B", + "step": 240 + }, + { + "expr": "node_timex_maxerror_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Maximum error in seconds", + "refId": "C", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Time Syncronized Drift", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "seconds", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 24 + }, + "hiddenSeries": false, + "id": 291, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_timex_loop_time_constant{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Phase-locked loop time adjust", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Time PLL Adjust", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 34 + }, + "hiddenSeries": false, + "id": 168, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Variation*./", + "color": "#890F02" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_timex_sync_status{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Is clock synchronized to a reliable server (1 = yes, 0 = no)", + "refId": "A", + "step": 240 + }, + { + "expr": "node_timex_frequency_adjustment_ratio{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Local clock frequency adjustment", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Time Syncronized Status", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 34 + }, + "hiddenSeries": false, + "id": 294, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_timex_tick_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Seconds between clock ticks", + "refId": "A", + "step": 240 + }, + { + "expr": "node_timex_tai_offset_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "International Atomic Time (TAI) offset", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Time Misc", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "seconds", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "title": "System Timesync", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 312, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 7 + }, + "hiddenSeries": false, + "id": 62, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_procs_blocked{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Processes blocked waiting for I/O to complete", + "refId": "A", + "step": 240 + }, + { + "expr": "node_procs_running{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Processes in runnable state", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Processes Status", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6500", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6501", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 7 + }, + "hiddenSeries": false, + "id": 315, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_processes_state{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ state }}", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Processes State", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6500", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6501", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 17 + }, + "hiddenSeries": false, + "id": 148, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_forks_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Processes forks second", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Processes Forks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6640", + "format": "short", + "label": "forks / sec", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6641", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 17 + }, + "hiddenSeries": false, + "id": 149, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Max.*/", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(process_virtual_memory_bytes{instance=\"$node\",job=\"$job\"}[5m])", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Processes virtual memory size in bytes", + "refId": "A", + "step": 240 + }, + { + "expr": "process_resident_memory_max_bytes{instance=\"$node\",job=\"$job\"}", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Maximum amount of virtual memory available in bytes", + "refId": "B", + "step": 240 + }, + { + "expr": "irate(process_virtual_memory_bytes{instance=\"$node\",job=\"$job\"}[5m])", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Processes virtual memory size in bytes", + "refId": "C", + "step": 240 + }, + { + "expr": "irate(process_virtual_memory_max_bytes{instance=\"$node\",job=\"$job\"}[5m])", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Maximum amount of virtual memory available in bytes", + "refId": "D", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Processes Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decbytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 27 + }, + "hiddenSeries": false, + "id": 313, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:709", + "alias": "PIDs limit", + "color": "#F2495C", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_processes_pids{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Number of PIDs", + "refId": "A", + "step": 240 + }, + { + "expr": "node_processes_max_processes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "PIDs limit", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "PIDs Number and Limit", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6500", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6501", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 27 + }, + "hiddenSeries": false, + "id": 305, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:4963", + "alias": "/.*waiting.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_schedstat_running_seconds_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "CPU {{ cpu }} - seconds spent running a process", + "refId": "A", + "step": 240 + }, + { + "expr": "irate(node_schedstat_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "CPU {{ cpu }} - seconds spent by processing waiting for this CPU", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Process schedule stats Running / Waiting", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:4860", + "format": "s", + "label": "seconds", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:4861", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 37 + }, + "hiddenSeries": false, + "id": 314, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:709", + "alias": "Threads limit", + "color": "#F2495C", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_processes_threads{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Allocated threads", + "refId": "A", + "step": 240 + }, + { + "expr": "node_processes_max_threads{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Threads limit", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Threads Number and Limit", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6500", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6501", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "title": "System Processes", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 25 + }, + "id": 269, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 8 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_context_switches_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Context switches", + "refId": "A", + "step": 240 + }, + { + "expr": "irate(node_intr_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Interrupts", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Context Switches / Interrupts", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 8 + }, + "hiddenSeries": false, + "id": 7, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_load1{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 4, + "legendFormat": "Load 1m", + "refId": "A", + "step": 480 + }, + { + "expr": "node_load5{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 4, + "legendFormat": "Load 5m", + "refId": "B", + "step": 480 + }, + { + "expr": "node_load15{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 4, + "legendFormat": "Load 15m", + "refId": "C", + "step": 480 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "System Load", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6261", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6262", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 18 + }, + "hiddenSeries": false, + "id": 259, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Critical*./", + "color": "#E24D42", + "fill": 0 + }, + { + "alias": "/.*Max*./", + "color": "#EF843C", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_interrupts_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ type }} - {{ info }}", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Interrupts Detail", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 18 + }, + "hiddenSeries": false, + "id": 306, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_schedstat_timeslices_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "CPU {{ cpu }}", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Schedule timeslices executed by each cpu", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:4860", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:4861", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 28 + }, + "hiddenSeries": false, + "id": 151, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_entropy_available_bits{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Entropy available to random number generators", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Entropy", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6568", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6569", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 28 + }, + "hiddenSeries": false, + "id": 308, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(process_cpu_seconds_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Time spent", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU time spent in user and system contexts", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:4860", + "format": "s", + "label": "seconds", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:4861", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 38 + }, + "hiddenSeries": false, + "id": 64, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:6323", + "alias": "/.*Max*./", + "color": "#890F02", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "process_max_fds{instance=\"$node\",job=\"$job\"}", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Maximum open file descriptors", + "refId": "A", + "step": 240 + }, + { + "expr": "process_open_fds{instance=\"$node\",job=\"$job\"}", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Open file descriptors", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "File Descriptors", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6338", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6339", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "System Misc", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 304, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 26 + }, + "hiddenSeries": false, + "id": 158, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:6726", + "alias": "/.*Critical*./", + "color": "#E24D42", + "fill": 0 + }, + { + "$$hashKey": "object:6727", + "alias": "/.*Max*./", + "color": "#EF843C", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_hwmon_temp_celsius{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ chip }} {{ sensor }} temp", + "refId": "A", + "step": 240 + }, + { + "expr": "node_hwmon_temp_crit_alarm_celsius{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ chip }} {{ sensor }} Critical Alarm", + "refId": "B", + "step": 240 + }, + { + "expr": "node_hwmon_temp_crit_celsius{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ chip }} {{ sensor }} Critical", + "refId": "C", + "step": 240 + }, + { + "expr": "node_hwmon_temp_crit_hyst_celsius{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ chip }} {{ sensor }} Critical Historical", + "refId": "D", + "step": 240 + }, + { + "expr": "node_hwmon_temp_max_celsius{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ chip }} {{ sensor }} Max", + "refId": "E", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Hardware temperature monitor", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6750", + "format": "celsius", + "label": "temperature", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6751", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 26 + }, + "hiddenSeries": false, + "id": 300, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:1655", + "alias": "/.*Max*./", + "color": "#EF843C", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_cooling_device_cur_state{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Current {{ name }} in {{ type }}", + "refId": "A", + "step": 240 + }, + { + "expr": "node_cooling_device_max_state{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Max {{ name }} in {{ type }}", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Throttle cooling device", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1678", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:1679", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 36 + }, + "hiddenSeries": false, + "id": 302, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_power_supply_online{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ power_supply }} online", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Power supply", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1678", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:1679", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "title": "Hardware Misc", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 27 + }, + "id": 296, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 10 + }, + "hiddenSeries": false, + "id": 297, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_systemd_socket_accepted_connections_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ name }} Connections", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Systemd Sockets", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 10 + }, + "hiddenSeries": false, + "id": 298, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Failed", + "color": "#F2495C" + }, + { + "alias": "Inactive", + "color": "#FF9830" + }, + { + "alias": "Active", + "color": "#73BF69" + }, + { + "alias": "Deactivating", + "color": "#FFCB7D" + }, + { + "alias": "Activating", + "color": "#C8F2C2" + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"activating\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Activating", + "refId": "A", + "step": 240 + }, + { + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"active\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Active", + "refId": "B", + "step": 240 + }, + { + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"deactivating\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Deactivating", + "refId": "C", + "step": 240 + }, + { + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"failed\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Failed", + "refId": "D", + "step": 240 + }, + { + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"inactive\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Inactive", + "refId": "E", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Systemd Units State", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "title": "Systemd", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 28 + }, + "id": 270, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 29 + }, + "hiddenSeries": false, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "$$hashKey": "object:2033", + "alias": "/.*Read.*/", + "transform": "negative-Y" + }, + { + "$$hashKey": "object:2034", + "alias": "/.*sda_.*/", + "color": "#7EB26D" + }, + { + "$$hashKey": "object:2035", + "alias": "/.*sdb_.*/", + "color": "#EAB839" + }, + { + "$$hashKey": "object:2036", + "alias": "/.*sdc_.*/", + "color": "#6ED0E0" + }, + { + "$$hashKey": "object:2037", + "alias": "/.*sdd_.*/", + "color": "#EF843C" + }, + { + "$$hashKey": "object:2038", + "alias": "/.*sde_.*/", + "color": "#E24D42" + }, + { + "$$hashKey": "object:2039", + "alias": "/.*sda1.*/", + "color": "#584477" + }, + { + "$$hashKey": "object:2040", + "alias": "/.*sda2_.*/", + "color": "#BA43A9" + }, + { + "$$hashKey": "object:2041", + "alias": "/.*sda3_.*/", + "color": "#F4D598" + }, + { + "$$hashKey": "object:2042", + "alias": "/.*sdb1.*/", + "color": "#0A50A1" + }, + { + "$$hashKey": "object:2043", + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "$$hashKey": "object:2044", + "alias": "/.*sdb3.*/", + "color": "#E0752D" + }, + { + "$$hashKey": "object:2045", + "alias": "/.*sdc1.*/", + "color": "#962D82" + }, + { + "$$hashKey": "object:2046", + "alias": "/.*sdc2.*/", + "color": "#614D93" + }, + { + "$$hashKey": "object:2047", + "alias": "/.*sdc3.*/", + "color": "#9AC48A" + }, + { + "$$hashKey": "object:2048", + "alias": "/.*sdd1.*/", + "color": "#65C5DB" + }, + { + "$$hashKey": "object:2049", + "alias": "/.*sdd2.*/", + "color": "#F9934E" + }, + { + "$$hashKey": "object:2050", + "alias": "/.*sdd3.*/", + "color": "#EA6460" + }, + { + "$$hashKey": "object:2051", + "alias": "/.*sde1.*/", + "color": "#E0F9D7" + }, + { + "$$hashKey": "object:2052", + "alias": "/.*sdd2.*/", + "color": "#FCEACA" + }, + { + "$$hashKey": "object:2053", + "alias": "/.*sde3.*/", + "color": "#F9E2D2" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\"}[5m])", + "intervalFactor": 4, + "legendFormat": "{{device}} - Reads completed", + "refId": "A", + "step": 8 + }, + { + "expr": "irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\"}[5m])", + "intervalFactor": 2, + "legendFormat": "{{device}} - Writes completed", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk IOps Completed", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2186", + "format": "iops", + "label": "IO read (-) / write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:2187", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 29 + }, + "hiddenSeries": false, + "id": 33, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Read.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*sda_.*/", + "color": "#7EB26D" + }, + { + "alias": "/.*sdb_.*/", + "color": "#EAB839" + }, + { + "alias": "/.*sdc_.*/", + "color": "#6ED0E0" + }, + { + "alias": "/.*sdd_.*/", + "color": "#EF843C" + }, + { + "alias": "/.*sde_.*/", + "color": "#E24D42" + }, + { + "alias": "/.*sda1.*/", + "color": "#584477" + }, + { + "alias": "/.*sda2_.*/", + "color": "#BA43A9" + }, + { + "alias": "/.*sda3_.*/", + "color": "#F4D598" + }, + { + "alias": "/.*sdb1.*/", + "color": "#0A50A1" + }, + { + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "alias": "/.*sdb3.*/", + "color": "#E0752D" + }, + { + "alias": "/.*sdc1.*/", + "color": "#962D82" + }, + { + "alias": "/.*sdc2.*/", + "color": "#614D93" + }, + { + "alias": "/.*sdc3.*/", + "color": "#9AC48A" + }, + { + "alias": "/.*sdd1.*/", + "color": "#65C5DB" + }, + { + "alias": "/.*sdd2.*/", + "color": "#F9934E" + }, + { + "alias": "/.*sdd3.*/", + "color": "#EA6460" + }, + { + "alias": "/.*sde1.*/", + "color": "#E0F9D7" + }, + { + "alias": "/.*sdd2.*/", + "color": "#FCEACA" + }, + { + "alias": "/.*sde3.*/", + "color": "#F9E2D2" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_read_bytes_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 4, + "legendFormat": "{{device}} - Read bytes", + "refId": "A", + "step": 8 + }, + { + "expr": "irate(node_disk_written_bytes_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Written bytes", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk R/W Data", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": "bytes read (-) / write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 3, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 39 + }, + "hiddenSeries": false, + "id": 37, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Read.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*sda_.*/", + "color": "#7EB26D" + }, + { + "alias": "/.*sdb_.*/", + "color": "#EAB839" + }, + { + "alias": "/.*sdc_.*/", + "color": "#6ED0E0" + }, + { + "alias": "/.*sdd_.*/", + "color": "#EF843C" + }, + { + "alias": "/.*sde_.*/", + "color": "#E24D42" + }, + { + "alias": "/.*sda1.*/", + "color": "#584477" + }, + { + "alias": "/.*sda2_.*/", + "color": "#BA43A9" + }, + { + "alias": "/.*sda3_.*/", + "color": "#F4D598" + }, + { + "alias": "/.*sdb1.*/", + "color": "#0A50A1" + }, + { + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "alias": "/.*sdb3.*/", + "color": "#E0752D" + }, + { + "alias": "/.*sdc1.*/", + "color": "#962D82" + }, + { + "alias": "/.*sdc2.*/", + "color": "#614D93" + }, + { + "alias": "/.*sdc3.*/", + "color": "#9AC48A" + }, + { + "alias": "/.*sdd1.*/", + "color": "#65C5DB" + }, + { + "alias": "/.*sdd2.*/", + "color": "#F9934E" + }, + { + "alias": "/.*sdd3.*/", + "color": "#EA6460" + }, + { + "alias": "/.*sde1.*/", + "color": "#E0F9D7" + }, + { + "alias": "/.*sdd2.*/", + "color": "#FCEACA" + }, + { + "alias": "/.*sde3.*/", + "color": "#F9E2D2" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_read_time_seconds_total{instance=\"$node\",job=\"$job\"}[5m])", + "hide": false, + "intervalFactor": 4, + "legendFormat": "{{device}} - Read time", + "refId": "A", + "step": 8 + }, + { + "expr": "irate(node_disk_write_time_seconds_total{instance=\"$node\",job=\"$job\"}[5m])", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}} - Write time", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk R/W Time", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "time. read (-) / write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 39 + }, + "hiddenSeries": false, + "id": 35, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*sda_.*/", + "color": "#7EB26D" + }, + { + "alias": "/.*sdb_.*/", + "color": "#EAB839" + }, + { + "alias": "/.*sdc_.*/", + "color": "#6ED0E0" + }, + { + "alias": "/.*sdd_.*/", + "color": "#EF843C" + }, + { + "alias": "/.*sde_.*/", + "color": "#E24D42" + }, + { + "alias": "/.*sda1.*/", + "color": "#584477" + }, + { + "alias": "/.*sda2_.*/", + "color": "#BA43A9" + }, + { + "alias": "/.*sda3_.*/", + "color": "#F4D598" + }, + { + "alias": "/.*sdb1.*/", + "color": "#0A50A1" + }, + { + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "alias": "/.*sdb3.*/", + "color": "#E0752D" + }, + { + "alias": "/.*sdc1.*/", + "color": "#962D82" + }, + { + "alias": "/.*sdc2.*/", + "color": "#614D93" + }, + { + "alias": "/.*sdc3.*/", + "color": "#9AC48A" + }, + { + "alias": "/.*sdd1.*/", + "color": "#65C5DB" + }, + { + "alias": "/.*sdd2.*/", + "color": "#F9934E" + }, + { + "alias": "/.*sdd3.*/", + "color": "#EA6460" + }, + { + "alias": "/.*sde1.*/", + "color": "#E0F9D7" + }, + { + "alias": "/.*sdd2.*/", + "color": "#FCEACA" + }, + { + "alias": "/.*sde3.*/", + "color": "#F9E2D2" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_io_time_weighted_seconds_total{instance=\"$node\",job=\"$job\"}[5m])", + "intervalFactor": 4, + "legendFormat": "{{device}} - IO time weighted", + "refId": "A", + "step": 8 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk IOs Weighted", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "time", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 49 + }, + "hiddenSeries": false, + "id": 133, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Read.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*sda_.*/", + "color": "#7EB26D" + }, + { + "alias": "/.*sdb_.*/", + "color": "#EAB839" + }, + { + "alias": "/.*sdc_.*/", + "color": "#6ED0E0" + }, + { + "alias": "/.*sdd_.*/", + "color": "#EF843C" + }, + { + "alias": "/.*sde_.*/", + "color": "#E24D42" + }, + { + "alias": "/.*sda1.*/", + "color": "#584477" + }, + { + "alias": "/.*sda2_.*/", + "color": "#BA43A9" + }, + { + "alias": "/.*sda3_.*/", + "color": "#F4D598" + }, + { + "alias": "/.*sdb1.*/", + "color": "#0A50A1" + }, + { + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "alias": "/.*sdb3.*/", + "color": "#E0752D" + }, + { + "alias": "/.*sdc1.*/", + "color": "#962D82" + }, + { + "alias": "/.*sdc2.*/", + "color": "#614D93" + }, + { + "alias": "/.*sdc3.*/", + "color": "#9AC48A" + }, + { + "alias": "/.*sdd1.*/", + "color": "#65C5DB" + }, + { + "alias": "/.*sdd2.*/", + "color": "#F9934E" + }, + { + "alias": "/.*sdd3.*/", + "color": "#EA6460" + }, + { + "alias": "/.*sde1.*/", + "color": "#E0F9D7" + }, + { + "alias": "/.*sdd2.*/", + "color": "#FCEACA" + }, + { + "alias": "/.*sde3.*/", + "color": "#F9E2D2" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_reads_merged_total{instance=\"$node\",job=\"$job\"}[5m])", + "intervalFactor": 2, + "legendFormat": "{{device}} - Read merged", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_disk_writes_merged_total{instance=\"$node\",job=\"$job\"}[5m])", + "intervalFactor": 2, + "legendFormat": "{{device}} - Write merged", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk R/W Merged", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "iops", + "label": "I/Os", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 3, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 49 + }, + "hiddenSeries": false, + "id": 36, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*sda_.*/", + "color": "#7EB26D" + }, + { + "alias": "/.*sdb_.*/", + "color": "#EAB839" + }, + { + "alias": "/.*sdc_.*/", + "color": "#6ED0E0" + }, + { + "alias": "/.*sdd_.*/", + "color": "#EF843C" + }, + { + "alias": "/.*sde_.*/", + "color": "#E24D42" + }, + { + "alias": "/.*sda1.*/", + "color": "#584477" + }, + { + "alias": "/.*sda2_.*/", + "color": "#BA43A9" + }, + { + "alias": "/.*sda3_.*/", + "color": "#F4D598" + }, + { + "alias": "/.*sdb1.*/", + "color": "#0A50A1" + }, + { + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "alias": "/.*sdb3.*/", + "color": "#E0752D" + }, + { + "alias": "/.*sdc1.*/", + "color": "#962D82" + }, + { + "alias": "/.*sdc2.*/", + "color": "#614D93" + }, + { + "alias": "/.*sdc3.*/", + "color": "#9AC48A" + }, + { + "alias": "/.*sdd1.*/", + "color": "#65C5DB" + }, + { + "alias": "/.*sdd2.*/", + "color": "#F9934E" + }, + { + "alias": "/.*sdd3.*/", + "color": "#EA6460" + }, + { + "alias": "/.*sde1.*/", + "color": "#E0F9D7" + }, + { + "alias": "/.*sdd2.*/", + "color": "#FCEACA" + }, + { + "alias": "/.*sde3.*/", + "color": "#F9E2D2" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_io_time_seconds_total{instance=\"$node\",job=\"$job\"}[5m])", + "intervalFactor": 4, + "legendFormat": "{{device}} - IO time", + "refId": "A", + "step": 8 + }, + { + "expr": "irate(node_disk_discard_time_seconds_total{instance=\"$node\",job=\"$job\"}[5m])", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{device}} - discard time", + "refId": "B", + "step": 8 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Time Spent Doing I/Os", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "time", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 59 + }, + "hiddenSeries": false, + "id": 34, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*sda_.*/", + "color": "#7EB26D" + }, + { + "alias": "/.*sdb_.*/", + "color": "#EAB839" + }, + { + "alias": "/.*sdc_.*/", + "color": "#6ED0E0" + }, + { + "alias": "/.*sdd_.*/", + "color": "#EF843C" + }, + { + "alias": "/.*sde_.*/", + "color": "#E24D42" + }, + { + "alias": "/.*sda1.*/", + "color": "#584477" + }, + { + "alias": "/.*sda2_.*/", + "color": "#BA43A9" + }, + { + "alias": "/.*sda3_.*/", + "color": "#F4D598" + }, + { + "alias": "/.*sdb1.*/", + "color": "#0A50A1" + }, + { + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "alias": "/.*sdb3.*/", + "color": "#E0752D" + }, + { + "alias": "/.*sdc1.*/", + "color": "#962D82" + }, + { + "alias": "/.*sdc2.*/", + "color": "#614D93" + }, + { + "alias": "/.*sdc3.*/", + "color": "#9AC48A" + }, + { + "alias": "/.*sdd1.*/", + "color": "#65C5DB" + }, + { + "alias": "/.*sdd2.*/", + "color": "#F9934E" + }, + { + "alias": "/.*sdd3.*/", + "color": "#EA6460" + }, + { + "alias": "/.*sde1.*/", + "color": "#E0F9D7" + }, + { + "alias": "/.*sdd2.*/", + "color": "#FCEACA" + }, + { + "alias": "/.*sde3.*/", + "color": "#F9E2D2" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_io_now{instance=\"$node\",job=\"$job\"}[5m])", + "intervalFactor": 4, + "legendFormat": "{{device}} - IO now", + "refId": "A", + "step": 8 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk IOs Current in Progress", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "iops", + "label": "I/Os", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 59 + }, + "hiddenSeries": false, + "id": 301, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2034", + "alias": "/.*sda_.*/", + "color": "#7EB26D" + }, + { + "$$hashKey": "object:2035", + "alias": "/.*sdb_.*/", + "color": "#EAB839" + }, + { + "$$hashKey": "object:2036", + "alias": "/.*sdc_.*/", + "color": "#6ED0E0" + }, + { + "$$hashKey": "object:2037", + "alias": "/.*sdd_.*/", + "color": "#EF843C" + }, + { + "$$hashKey": "object:2038", + "alias": "/.*sde_.*/", + "color": "#E24D42" + }, + { + "$$hashKey": "object:2039", + "alias": "/.*sda1.*/", + "color": "#584477" + }, + { + "$$hashKey": "object:2040", + "alias": "/.*sda2_.*/", + "color": "#BA43A9" + }, + { + "$$hashKey": "object:2041", + "alias": "/.*sda3_.*/", + "color": "#F4D598" + }, + { + "$$hashKey": "object:2042", + "alias": "/.*sdb1.*/", + "color": "#0A50A1" + }, + { + "$$hashKey": "object:2043", + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "$$hashKey": "object:2044", + "alias": "/.*sdb3.*/", + "color": "#E0752D" + }, + { + "$$hashKey": "object:2045", + "alias": "/.*sdc1.*/", + "color": "#962D82" + }, + { + "$$hashKey": "object:2046", + "alias": "/.*sdc2.*/", + "color": "#614D93" + }, + { + "$$hashKey": "object:2047", + "alias": "/.*sdc3.*/", + "color": "#9AC48A" + }, + { + "$$hashKey": "object:2048", + "alias": "/.*sdd1.*/", + "color": "#65C5DB" + }, + { + "$$hashKey": "object:2049", + "alias": "/.*sdd2.*/", + "color": "#F9934E" + }, + { + "$$hashKey": "object:2050", + "alias": "/.*sdd3.*/", + "color": "#EA6460" + }, + { + "$$hashKey": "object:2051", + "alias": "/.*sde1.*/", + "color": "#E0F9D7" + }, + { + "$$hashKey": "object:2052", + "alias": "/.*sdd2.*/", + "color": "#FCEACA" + }, + { + "$$hashKey": "object:2053", + "alias": "/.*sde3.*/", + "color": "#F9E2D2" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_discards_completed_total{instance=\"$node\",job=\"$job\"}[5m])", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{device}} - Discards completed", + "refId": "A", + "step": 8 + }, + { + "expr": "irate(node_disk_discards_merged_total{instance=\"$node\",job=\"$job\"}[5m])", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{device}} - Discards merged", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk IOps Discards completed / merged", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2186", + "format": "iops", + "label": "IOs", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:2187", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "Storage Disk", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 29 + }, + "id": 271, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 3, + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 78 + }, + "hiddenSeries": false, + "id": 43, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{mountpoint}} - Available", + "metric": "", + "refId": "A", + "step": 4 + }, + { + "expr": "node_filesystem_free_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "{{mountpoint}} - Free", + "refId": "B", + "step": 2 + }, + { + "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "{{mountpoint}} - Size", + "refId": "C", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Filesystem space available", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:3826", + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:3827", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 78 + }, + "hiddenSeries": false, + "id": 41, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_filesystem_files_free{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{mountpoint}} - Free file nodes", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "File Nodes Free", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:3894", + "format": "short", + "label": "file nodes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:3895", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 88 + }, + "hiddenSeries": false, + "id": 28, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_filefd_maximum{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 4, + "legendFormat": "Max open files", + "refId": "A", + "step": 8 + }, + { + "expr": "node_filefd_allocated{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Open files", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "File Descriptor", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "files", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 88 + }, + "hiddenSeries": false, + "id": 219, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_filesystem_files{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{mountpoint}} - File nodes total", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "File Nodes Size", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "file Nodes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "/ ReadOnly": "#890F02" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 98 + }, + "hiddenSeries": false, + "id": 44, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_filesystem_readonly{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{mountpoint}} - ReadOnly", + "refId": "A", + "step": 4 + }, + { + "expr": "node_filesystem_device_error{instance=\"$node\",job=\"$job\",device!~'rootfs',fstype!~'tmpfs'}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{mountpoint}} - Device error", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Filesystem in ReadOnly / Error", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:3670", + "format": "short", + "label": "counter", + "logBase": 1, + "max": "1", + "min": "0", + "show": true + }, + { + "$$hashKey": "object:3671", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "Storage Filesystem", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 272, + "panels": [ + { + "aliasColors": { + "receive_packets_eth0": "#7EB26D", + "receive_packets_lo": "#E24D42", + "transmit_packets_eth0": "#7EB26D", + "transmit_packets_lo": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 30 + }, + "hiddenSeries": false, + "id": 60, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_network_receive_packets_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{device}} - Receive", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_network_transmit_packets_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{device}} - Transmit", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic by Packets", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "label": "packets out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 30 + }, + "hiddenSeries": false, + "id": 142, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_network_receive_errs_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Receive errors", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_network_transmit_errs_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Rransmit errors", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic Errors", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "label": "packets out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 40 + }, + "hiddenSeries": false, + "id": 143, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_network_receive_drop_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Receive drop", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_network_transmit_drop_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Transmit drop", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic Drop", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "label": "packets out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 40 + }, + "hiddenSeries": false, + "id": 141, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_network_receive_compressed_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Receive compressed", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_network_transmit_compressed_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Transmit compressed", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic Compressed", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "label": "packets out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 50 + }, + "hiddenSeries": false, + "id": 146, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_network_receive_multicast_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Receive multicast", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic Multicast", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "label": "packets out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 50 + }, + "hiddenSeries": false, + "id": 144, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_network_receive_fifo_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Receive fifo", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_network_transmit_fifo_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Transmit fifo", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic Fifo", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "label": "packets out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 60 + }, + "hiddenSeries": false, + "id": 145, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:576", + "alias": "/.*Trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_network_receive_frame_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}} - Receive frame", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic Frame", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:589", + "format": "pps", + "label": "packets out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:590", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 60 + }, + "hiddenSeries": false, + "id": 231, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_network_transmit_carrier_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Statistic transmit_carrier", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic Carrier", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 70 + }, + "hiddenSeries": false, + "id": 232, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_network_transmit_colls_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Transmit colls", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic Colls", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 70 + }, + "hiddenSeries": false, + "id": 61, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:663", + "alias": "NF conntrack limit", + "color": "#890F02", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_nf_conntrack_entries{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "NF conntrack entries", + "refId": "A", + "step": 4 + }, + { + "expr": "node_nf_conntrack_entries_limit{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "NF conntrack limit", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "NF Contrack", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:678", + "format": "short", + "label": "entries", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:679", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 80 + }, + "hiddenSeries": false, + "id": 230, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_arp_entries{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ device }} - ARP entries", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "ARP Entries", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Entries", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 80 + }, + "hiddenSeries": false, + "id": 288, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 1, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_network_mtu_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ device }} - Bytes", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "MTU", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 90 + }, + "hiddenSeries": false, + "id": 280, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 1, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_network_speed_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ device }} - Speed", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Speed", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 90 + }, + "hiddenSeries": false, + "id": 289, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 1, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_network_transmit_queue_length{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ device }} - Interface transmit queue length", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Queue Length", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "none", + "label": "packets", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 100 + }, + "hiddenSeries": false, + "id": 290, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:232", + "alias": "/.*Dropped.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_softnet_processed_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "CPU {{cpu}} - Processed", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_softnet_dropped_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "CPU {{cpu}} - Dropped", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Softnet Packets", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:207", + "format": "short", + "label": "packetes drop (-) / process (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:208", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 100 + }, + "hiddenSeries": false, + "id": 310, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_softnet_times_squeezed_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "CPU {{cpu}} - Squeezed", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Softnet Out of Quota", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:207", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:208", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 110 + }, + "hiddenSeries": false, + "id": 309, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_network_up{operstate=\"up\",instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{interface}} - Operational state UP", + "refId": "A", + "step": 4 + }, + { + "expr": "node_network_carrier{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "instant": false, + "legendFormat": "{{device}} - Physical link state", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Operational Status", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "Network Traffic", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 273, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 13 + }, + "hiddenSeries": false, + "id": 63, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_sockstat_TCP_alloc{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "TCP_alloc - Allocated sockets", + "refId": "A", + "step": 240 + }, + { + "expr": "node_sockstat_TCP_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "TCP_inuse - Tcp sockets currently in use", + "refId": "B", + "step": 240 + }, + { + "expr": "node_sockstat_TCP_mem{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "TCP_mem - Used memory for tcp", + "refId": "C", + "step": 240 + }, + { + "expr": "node_sockstat_TCP_orphan{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "TCP_orphan - Orphan sockets", + "refId": "D", + "step": 240 + }, + { + "expr": "node_sockstat_TCP_tw{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "TCP_tw - Sockets wating close", + "refId": "E", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Sockstat TCP", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 13 + }, + "hiddenSeries": false, + "id": 124, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_sockstat_UDPLITE_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "UDPLITE_inuse - Udplite sockets currently in use", + "refId": "A", + "step": 240 + }, + { + "expr": "node_sockstat_UDP_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "UDP_inuse - Udp sockets currently in use", + "refId": "B", + "step": 240 + }, + { + "expr": "node_sockstat_UDP_mem{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "UDP_mem - Used memory for udp", + "refId": "C", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Sockstat UDP", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 23 + }, + "hiddenSeries": false, + "id": 126, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_sockstat_sockets_used{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Sockets_used - Sockets currently in use", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Sockstat Used", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "sockets", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 23 + }, + "hiddenSeries": false, + "id": 220, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_sockstat_TCP_mem_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "mem_bytes - TCP sockets in that state", + "refId": "A", + "step": 240 + }, + { + "expr": "node_sockstat_UDP_mem_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "mem_bytes - UDP sockets in that state", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Sockstat Memory Size", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 33 + }, + "hiddenSeries": false, + "id": 125, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_sockstat_FRAG_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "FRAG_inuse - Frag sockets currently in use", + "refId": "A", + "step": 240 + }, + { + "expr": "node_sockstat_FRAG_memory{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "FRAG_memory - Used memory for frag", + "refId": "B", + "step": 240 + }, + { + "expr": "node_sockstat_RAW_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "RAW_inuse - Raw sockets currently in use", + "refId": "C", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Sockstat FRAG / RAW", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1572", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1573", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "Network Sockstat", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 274, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 32 + }, + "height": "", + "hiddenSeries": false, + "id": 221, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:1876", + "alias": "/.*Out.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_netstat_IpExt_InOctets{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "InOctets - Received octets", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_netstat_IpExt_OutOctets{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "OutOctets - Sent octets", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Netstat IP In / Out Octets", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1889", + "format": "short", + "label": "octects out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:1890", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 32 + }, + "height": "", + "hiddenSeries": false, + "id": 81, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_netstat_Ip_Forwarding{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Forwarding - IP forwarding", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Netstat IP Forwarding", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1957", + "format": "short", + "label": "datagrams", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1958", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 42 + }, + "height": "", + "hiddenSeries": false, + "id": 115, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Out.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_netstat_Icmp_InMsgs{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "InMsgs - Messages which the entity received. Note that this counter includes all those counted by icmpInErrors", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_netstat_Icmp_OutMsgs{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "OutMsgs - Messages which this entity attempted to send. Note that this counter includes all those counted by icmpOutErrors", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "ICMP In / Out", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "messages out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 42 + }, + "height": "", + "hiddenSeries": false, + "id": 50, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Out.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_netstat_Icmp_InErrors{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "InErrors - Messages which the entity received but determined as having ICMP-specific errors (bad ICMP checksums, bad length, etc.)", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "ICMP Errors", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "messages out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 52 + }, + "height": "", + "hiddenSeries": false, + "id": 55, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Out.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*Snd.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_netstat_Udp_InDatagrams{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "InDatagrams - Datagrams received", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_netstat_Udp_OutDatagrams{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "OutDatagrams - Datagrams sent", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "UDP In / Out", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "datagrams out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 52 + }, + "height": "", + "hiddenSeries": false, + "id": 109, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_netstat_Udp_InErrors{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "InErrors - UDP Datagrams that could not be delivered to an application", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_netstat_Udp_NoPorts{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "NoPorts - UDP Datagrams received on a port with no listener", + "refId": "B", + "step": 4 + }, + { + "expr": "irate(node_netstat_UdpLite_InErrors{instance=\"$node\",job=\"$job\"}[5m])", + "interval": "", + "legendFormat": "InErrors Lite - UDPLite Datagrams that could not be delivered to an application", + "refId": "C" + }, + { + "expr": "irate(node_netstat_Udp_RcvbufErrors{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "RcvbufErrors - UDP buffer errors received", + "refId": "D", + "step": 4 + }, + { + "expr": "irate(node_netstat_Udp_SndbufErrors{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "SndbufErrors - UDP buffer errors send", + "refId": "E", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "UDP Errors", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:4232", + "format": "short", + "label": "datagrams", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:4233", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 62 + }, + "height": "", + "hiddenSeries": false, + "id": 299, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Out.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*Snd.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_netstat_Tcp_InSegs{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "InSegs - Segments received, including those received in error. This count includes segments received on currently established connections", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_netstat_Tcp_OutSegs{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "OutSegs - Segments sent, including those on current connections but excluding those containing only retransmitted octets", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "TCP In / Out", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "datagrams out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 62 + }, + "height": "", + "hiddenSeries": false, + "id": 104, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_netstat_TcpExt_ListenOverflows{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "ListenOverflows - Times the listen queue of a socket overflowed", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_netstat_TcpExt_ListenDrops{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "ListenDrops - SYNs to LISTEN sockets ignored", + "refId": "B", + "step": 4 + }, + { + "expr": "irate(node_netstat_TcpExt_TCPSynRetrans{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "TCPSynRetrans - SYN-SYN/ACK retransmits to break down retransmissions in SYN, fast/timeout retransmits", + "refId": "C", + "step": 4 + }, + { + "expr": "irate(node_netstat_Tcp_RetransSegs{instance=\"$node\",job=\"$job\"}[5m])", + "interval": "", + "legendFormat": "RetransSegs - Segments retransmitted - that is, the number of TCP segments transmitted containing one or more previously transmitted octets", + "refId": "D" + }, + { + "expr": "irate(node_netstat_Tcp_InErrs{instance=\"$node\",job=\"$job\"}[5m])", + "interval": "", + "legendFormat": "InErrs - Segments received in error (e.g., bad TCP checksums)", + "refId": "E" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "TCP Errors", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 72 + }, + "height": "", + "hiddenSeries": false, + "id": 85, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:454", + "alias": "/.*MaxConn *./", + "color": "#890F02", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_netstat_Tcp_CurrEstab{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "CurrEstab - TCP connections for which the current state is either ESTABLISHED or CLOSE- WAIT", + "refId": "A", + "step": 4 + }, + { + "expr": "node_netstat_Tcp_MaxConn{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "MaxConn - Limit on the total number of TCP connections the entity can support (Dinamic is \"-1\")", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "TCP Connections", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:469", + "format": "short", + "label": "connections", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:470", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 72 + }, + "height": "", + "hiddenSeries": false, + "id": 91, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Sent.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_netstat_TcpExt_SyncookiesFailed{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "SyncookiesFailed - Invalid SYN cookies received", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_netstat_TcpExt_SyncookiesRecv{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "SyncookiesRecv - SYN cookies received", + "refId": "B", + "step": 4 + }, + { + "expr": "irate(node_netstat_TcpExt_SyncookiesSent{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "SyncookiesSent - SYN cookies sent", + "refId": "C", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "TCP SynCookie", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 82 + }, + "height": "", + "hiddenSeries": false, + "id": 82, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_netstat_Tcp_ActiveOpens{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "ActiveOpens - TCP connections that have made a direct transition to the SYN-SENT state from the CLOSED state", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_netstat_Tcp_PassiveOpens{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "PassiveOpens - TCP connections that have made a direct transition to the SYN-RCVD state from the LISTEN state", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "TCP Direct Transition", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "connections", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "Network Netstat", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 279, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 54 + }, + "hiddenSeries": false, + "id": 40, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_scrape_collector_duration_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{collector}} - Scrape duration", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Node Exporter Scrape Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "seconds", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 54 + }, + "hiddenSeries": false, + "id": 157, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:1969", + "alias": "/.*error.*/", + "color": "#F2495C", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_scrape_collector_success{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{collector}} - Scrape success", + "refId": "A", + "step": 4 + }, + { + "expr": "node_textfile_scrape_error{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{collector}} - Scrape textfile error (1 = true)", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Node Exporter Scrape", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1484", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:1485", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "Node Exporter", + "type": "row" + } + ], + "refresh": "1m", + "schemaVersion": 22, + "style": "dark", + "tags": [ + "linux" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "default", + "value": "default" + }, + "hide": 0, + "includeAll": false, + "label": "datasource", + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "definition": "", + "hide": 0, + "includeAll": false, + "index": -1, + "label": "Job", + "multi": false, + "name": "job", + "options": [], + "query": "label_values(node_uname_info, job)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "definition": "label_values(node_uname_info{job=\"$job\"}, instance)", + "hide": 0, + "includeAll": false, + "index": -1, + "label": "Host:", + "multi": false, + "name": "node", + "options": [], + "query": "label_values(node_uname_info{job=\"$job\"}, instance)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "selected": false, + "text": "[a-z]+|nvme[0-9]+n[0-9]+", + "value": "[a-z]+|nvme[0-9]+n[0-9]+" + }, + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "diskdevices", + "options": [ + { + "selected": true, + "text": "[a-z]+|nvme[0-9]+n[0-9]+", + "value": "[a-z]+|nvme[0-9]+n[0-9]+" + } + ], + "query": "[a-z]+|nvme[0-9]+n[0-9]+", + "skipUrlSync": false, + "type": "custom" + } + ] + }, + "time": { + "from": "now-4h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Node Exporter", + "version": 1 +} \ No newline at end of file diff --git a/terraform-ci-infra/1n_nmd/grafana/conf/nomad.json b/terraform-ci-infra/1n_nmd/grafana/conf/nomad.json new file mode 100644 index 0000000000..40ffeddf7b --- /dev/null +++ b/terraform-ci-infra/1n_nmd/grafana/conf/nomad.json @@ -0,0 +1,869 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "5.3.2" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "5.0.0" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "5.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Nomad Jobs metrics", + "editable": true, + "gnetId": 12787, + "graphTooltip": 0, + "id": null, + "iteration": 1596708119930, + "links": [], + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "format": "dtdurations", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 16, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "alias": "", + "expr": "max(nomad_client_uptime{instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "", + "rawSql": "SELECT\n UNIX_TIMESTAMP() as time_sec,\n as value,\n as metric\nFROM \nWHERE $__timeFilter(time_column)\nORDER BY ASC\n", + "refId": "A" + } + ], + "thresholds": "", + "title": "Uptime", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorPrefix": false, + "colorValue": false, + "colors": [ + "#7eb26d", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 0 + }, + "id": 17, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "alias": "", + "expr": "count(sum(nomad_client_allocs_memory_cache) by (exported_job))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "", + "rawSql": "SELECT\n UNIX_TIMESTAMP() as time_sec,\n as value,\n as metric\nFROM
\nWHERE $__timeFilter(time_column)\nORDER BY ASC\n", + "refId": "A" + } + ], + "thresholds": "", + "title": "Jobs", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorPrefix": false, + "colorValue": false, + "colors": [ + "#7eb26d", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${DS_PROMETHEUS}", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 0 + }, + "id": 12, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "alias": "", + "expr": "sum(nomad_client_allocations_running{datacenter=\"$datacenter\",instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "", + "rawSql": "SELECT\n UNIX_TIMESTAMP() as time_sec,\n as value,\n as metric\nFROM
\nWHERE $__timeFilter(time_column)\nORDER BY ASC\n", + "refId": "A" + } + ], + "thresholds": "", + "title": "Allocs", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "gridPos": { + "h": 4, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 14, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "", + "expr": "sum(nomad_client_allocations_blocked{datacenter=\"$datacenter\",instance=~\"$instance\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Blocked", + "rawSql": "SELECT\n UNIX_TIMESTAMP() as time_sec,\n as value,\n as metric\nFROM
\nWHERE $__timeFilter(time_column)\nORDER BY ASC\n", + "refId": "A" + }, + { + "expr": "sum(nomad_client_allocations_pending{datacenter=\"$datacenter\",instance=~\"$instance\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Pending", + "refId": "B" + }, + { + "expr": "sum(nomad_client_allocations_restart{datacenter=\"$datacenter\",instance=~\"$instance\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Restart ", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Block/Peding/Restart", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 13 + }, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "host", + "repeatDirection": "v", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(nomad_client_allocs_cpu_total_percent[5m:10s]) > 1", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{task}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage Percent", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 3, + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 13 + }, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "host", + "repeatDirection": "v", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg(nomad_client_allocs_cpu_total_ticks{instance=~\"$instance\"}) by(exported_job, task)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{task}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "CPU Total Ticks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 3, + "format": "timeticks", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 19 + }, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "host", + "repeatDirection": "v", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg(nomad_client_allocs_memory_rss{instance=~\"$instance\"}) by(exported_job, task)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{task}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "RSS", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 3, + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 19 + }, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "host", + "repeatDirection": "v", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg(nomad_client_allocs_memory_cache{instance=~\"$instance\"}) by(exported_job, task)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{task}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory Cache", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 3, + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": false, + "schemaVersion": 16, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "default", + "value": "default" + }, + "hide": 0, + "includeAll": false, + "label": "datasource", + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "DC", + "multi": false, + "name": "datacenter", + "options": [], + "query": "label_values(nomad_client_uptime, datacenter)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Host", + "multi": true, + "name": "instance", + "options": [], + "query": "label_values(nomad_client_uptime{datacenter=~\"$datacenter\"}, instance)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-4h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Nomad", + "version": 1 +} \ No newline at end of file diff --git a/terraform-ci-infra/1n_nmd/grafana/conf/nomad/grafana.hcl b/terraform-ci-infra/1n_nmd/grafana/conf/nomad/grafana.hcl new file mode 100644 index 0000000000..7325c6aef4 --- /dev/null +++ b/terraform-ci-infra/1n_nmd/grafana/conf/nomad/grafana.hcl @@ -0,0 +1,331 @@ +job "${job_name}" { + # The "region" parameter specifies the region in which to execute the job. + # If omitted, this inherits the default region name of "global". + # region = "global" + # + # The "datacenters" parameter specifies the list of datacenters which should + # be considered when placing this task. This must be provided. + datacenters = "${datacenters}" + + # The "type" parameter controls the type of job, which impacts the scheduler's + # decision on placement. This configuration is optional and defaults to + # "service". For a full list of job types and their differences, please see + # the online documentation. + # + # For more information, please see the online documentation at: + # + # https://www.nomadproject.io/docs/jobspec/schedulers + # + type = "service" + + update { + # The "max_parallel" parameter specifies the maximum number of updates to + # perform in parallel. In this case, this specifies to update a single task + # at a time. + max_parallel = 1 + + health_check = "checks" + + # The "min_healthy_time" parameter specifies the minimum time the allocation + # must be in the healthy state before it is marked as healthy and unblocks + # further allocations from being updated. + min_healthy_time = "10s" + + # The "healthy_deadline" parameter specifies the deadline in which the + # allocation must be marked as healthy after which the allocation is + # automatically transitioned to unhealthy. Transitioning to unhealthy will + # fail the deployment and potentially roll back the job if "auto_revert" is + # set to true. + healthy_deadline = "3m" + + # The "progress_deadline" parameter specifies the deadline in which an + # allocation must be marked as healthy. The deadline begins when the first + # allocation for the deployment is created and is reset whenever an allocation + # as part of the deployment transitions to a healthy state. If no allocation + # transitions to the healthy state before the progress deadline, the + # deployment is marked as failed. + progress_deadline = "10m" + +%{ if use_canary } + # The "canary" parameter specifies that changes to the job that would result + # in destructive updates should create the specified number of canaries + # without stopping any previous allocations. Once the operator determines the + # canaries are healthy, they can be promoted which unblocks a rolling update + # of the remaining allocations at a rate of "max_parallel". + # + # Further, setting "canary" equal to the count of the task group allows + # blue/green deployments. When the job is updated, a full set of the new + # version is deployed and upon promotion the old version is stopped. + canary = 1 + + # Specifies if the job should auto-promote to the canary version when all + # canaries become healthy during a deployment. Defaults to false which means + # canaries must be manually updated with the nomad deployment promote + # command. + auto_promote = true + + # The "auto_revert" parameter specifies if the job should auto-revert to the + # last stable job on deployment failure. A job is marked as stable if all the + # allocations as part of its deployment were marked healthy. + auto_revert = true +%{ endif } + } + + # The "group" stanza defines a series of tasks that should be co-located on + # the same Nomad client. Any task within a group will be placed on the same + # client. + # + # For more information and examples on the "group" stanza, please see + # the online documentation at: + # + # https://www.nomadproject.io/docs/job-specification/group + # + group "prod-group1-${service_name}" { + # The "count" parameter specifies the number of the task groups that should + # be running under this group. This value must be non-negative and defaults + # to 1. + count = ${group_count} + + + # The constraint allows restricting the set of eligible nodes. Constraints + # may filter on attributes or client metadata. + # + # For more information and examples on the "volume" stanza, please see + # the online documentation at: + # + # https://www.nomadproject.io/docs/job-specification/constraint + # + constraint { + attribute = "$${attr.cpu.arch}" + operator = "!=" + value = "arm64" + } + + # The "task" stanza creates an individual unit of work, such as a Docker + # container, web application, or batch processing. + # + # For more information and examples on the "task" stanza, please see + # the online documentation at: + # + # https://www.nomadproject.io/docs/job-specification/task + # + task "prod-task1-${service_name}" { + # The "driver" parameter specifies the task driver that should be used to + # run the task. + driver = "docker" + + # The "config" stanza specifies the driver configuration, which is passed + # directly to the driver to start the task. The details of configurations + # are specific to each driver, so please see specific driver + # documentation for more information. + config { + image = "${image}" + dns_servers = [ "$${attr.unique.network.ip-address}" ] + volumes = [ + "secrets/prometheus.yml:/etc/grafana/provisioning/datasources/prometheus.yml", + "secrets/dashboards.yml:/etc/grafana/provisioning/dashboards/dashboards.yml", + "secrets/grafana.ini:/etc/grafana/grafana.ini", + "secrets/node_exporter.json:/etc/grafana/provisioning/dashboards/node_exporter.json", + "secrets/docker_cadvisor.json:/etc/grafana/provisioning/dashboards/docker_cadvisor.json", + "secrets/nomad.json:/etc/grafana/provisioning/dashboards/nomad.json", + "secrets/consul.json:/etc/grafana/provisioning/dashboards/consul.json", + "secrets/prometheus.json:/etc/grafana/provisioning/dashboards/prometheus.json", + "secrets/blackbox_exporter_http.json:/etc/grafana/provisioning/dashboards/blackbox_exporter_http.json", + "secrets/blackbox_exporter_icmp.json:/etc/grafana/provisioning/dashboards/blackbox_exporter_icmp.json" + ] + } + + artifact { + # Prometheus Node Exporter + source = "https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/node_exporter.json" + destination = "secrets/" + } + + artifact { + # Docker cAdvisor + source = "https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/docker_cadvisor.json" + destination = "secrets/" + } + + artifact { + # Nomad + source = "https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/nomad.json" + destination = "secrets/" + } + + artifact { + # Consul + source = "https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/consul.json" + destination = "secrets/" + } + + artifact { + # Prometheus + source = "https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/prometheus.json" + destination = "secrets/" + } + + artifact { + # Prometheus Blackbox Exporter HTTP + source = "https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/blackbox_exporter_http.json" + destination = "secrets/" + } + + artifact { + # Prometheus Blackbox Exporter ICMP + source = "https://raw.githubusercontent.com/pmikus/grafana-dashboards/main/blackbox_exporter_icmp.json" + destination = "secrets/" + } + + # The "template" stanza instructs Nomad to manage a template, such as + # a configuration file or script. This template can optionally pull data + # from Consul or Vault to populate runtime configuration data. + # + # For more information and examples on the "template" stanza, please see + # the online documentation at: + # + # https://www.nomadproject.io/docs/job-specification/template + # + template { + change_mode = "noop" + change_signal = "SIGINT" + destination = "secrets/prometheus.yml" + data = < 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Failed Connections", + "refId": "A", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_evaluator_iterations_missed_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Missed Iterations", + "refId": "B", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_evaluator_iterations_skipped_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Skipped Iterations", + "refId": "C", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_rule_evaluation_failures_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Evaluation", + "refId": "D", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_sd_azure_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Azure Refresh", + "refId": "E", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_sd_consul_rpc_failures_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Consul RPC", + "refId": "F", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_sd_dns_lookup_failures_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "DNS Lookup", + "refId": "G", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_sd_ec2_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "EC2 Refresh", + "refId": "H", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_sd_gce_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "GCE Refresh", + "refId": "I", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_sd_marathon_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Marathon Refresh", + "refId": "J", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_sd_openstack_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Openstack Refresh", + "refId": "K", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_sd_triton_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Triton Refresh", + "refId": "L", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_target_scrapes_exceeded_sample_limit_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Sample Limit", + "refId": "M", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Duplicate Timestamp", + "refId": "N", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_target_scrapes_sample_out_of_bounds_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Timestamp Out of Bounds", + "refId": "O", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_target_scrapes_sample_out_of_order_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Sample Out of Order", + "refId": "P", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_treecache_zookeeper_failures_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Zookeeper", + "refId": "Q", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_tsdb_compactions_failed_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "TSDB Compactions", + "refId": "R", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_tsdb_head_series_not_found{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Series Not Found", + "refId": "S", + "step": 2 + }, + { + "expr": "sum(increase(prometheus_tsdb_reloads_failures_total{instance=~\"$instance\"}[5m])) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Reload", + "refId": "T", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Failures and Errors", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Errors", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "errors", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "up{instance=~\"$instance\",job=~\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "refId": "A", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Upness (stacked)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "none", + "label": "Up", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "prometheus_tsdb_head_chunks{job=~\"$job\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Storage Memory Chunks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Chunks", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "up", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "prometheus_tsdb_head_series{job=~\"$job\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Series Count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Series", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 32, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "removed", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum( increase(prometheus_tsdb_head_series_created_total{instance=~\"$instance\"}[5m]) )", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "created", + "refId": "A", + "step": 4 + }, + { + "expr": "sum( increase(prometheus_tsdb_head_series_removed_total{instance=~\"$instance\"}[5m]) )", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "removed", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Series Created / Removed", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Series Count", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "series", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": { + "10.58.3.10:80": "#BA43A9" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "Rate of total number of appended samples", + "fill": 1, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(prometheus_tsdb_head_samples_appended_total{job=~\"$job\",instance=~\"$instance\"}[1m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Appended Samples per Second", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Samples / Second", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "appended samples", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "Total number of syncs that were executed on a scrape pool.", + "fill": 1, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(prometheus_target_scrape_pool_sync_total{job=~\"$job\",instance=~\"$instance\"}) by (scrape_job)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{scrape_job}}", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Scrape Sync Total", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Syncs", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "Actual interval to sync the scrape pool.", + "fill": 1, + "id": 21, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(prometheus_target_sync_length_seconds_sum{job=~\"$job\",instance=~\"$instance\"}[2m])) by (scrape_job) * 1000", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{scrape_job}}", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Target Sync", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Milliseconds", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "sync", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 29, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "scrape_duration_seconds{instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Scrape Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Seconds", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "Total number of rejected scrapes", + "fill": 1, + "id": 30, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(prometheus_target_scrapes_exceeded_sample_limit_total{job=~\"$job\",instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "exceeded sample limit", + "refId": "A", + "step": 4 + }, + { + "expr": "sum(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=~\"$job\",instance=~\"$instance\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "duplicate timestamp", + "refId": "B", + "step": 4 + }, + { + "expr": "sum(prometheus_target_scrapes_sample_out_of_bounds_total{job=~\"$job\",instance=~\"$instance\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "out of bounds", + "refId": "C", + "step": 4 + }, + { + "expr": "sum(prometheus_target_scrapes_sample_out_of_order_total{job=~\"$job\",instance=~\"$instance\"}) ", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "out of order", + "refId": "D", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Rejected Scrapes", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": "Scrapes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "scrapes", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "The duration of rule group evaluations", + "fill": 1, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "1000 * rate(prometheus_evaluator_duration_seconds_sum{job=~\"$job\", instance=~\"$instance\"}[5m]) / rate(prometheus_evaluator_duration_seconds_count{job=~\"$job\", instance=~\"$instance\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "E", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Average Rule Evaluation Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Milliseconds", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(http_request_duration_microseconds_count{job=~\"$job\",instance=~\"$instance\"}[1m])) by (handler) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{handler}}", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "HTTP Request Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Microseconds", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(prometheus_engine_query_duration_seconds_sum{job=~\"$job\",instance=~\"$instance\"}) by (slice)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{slice}}", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Prometheus Engine Query Duration Seconds", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Seconds", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "Rule-group evaluations \n - total\n - missed due to slow rule group evaluation\n - skipped due to throttled metric storage", + "fill": 1, + "id": 31, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(prometheus_evaluator_iterations_total{job=~\"$job\", instance=~\"$instance\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Total", + "refId": "B", + "step": 4 + }, + { + "expr": "sum(rate(prometheus_evaluator_iterations_missed_total{job=~\"$job\", instance=~\"$instance\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Missed", + "refId": "A", + "step": 4 + }, + { + "expr": "sum(rate(prometheus_evaluator_iterations_skipped_total{job=~\"$job\", instance=~\"$instance\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Skipped", + "refId": "C", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Rule Evaluator Iterations", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "iterations", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "durations", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 22, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(prometheus_notifications_sent_total[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Notifications Sent", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Notifications", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "notifications", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 23, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "(time() - prometheus_config_last_reload_success_timestamp_seconds{job=~\"$job\",instance=~\"$instance\"}) / 60", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Minutes Since Successful Config Reload", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Minutes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 24, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "prometheus_config_last_reload_successful{job=~\"$job\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Successful Config Reload", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": "Success", + "logBase": 1, + "max": "1", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "config", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "GC invocation durations", + "fill": 1, + "id": 28, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(go_gc_duration_seconds_sum{instance=~\"$instance\",job=~\"$job\"}[2m])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "GC Rate / 2m", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "garbage collection", + "titleSize": "h6" + }, + { + "collapse": true, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "This is probably wrong! Please help.", + "fill": 1, + "id": 26, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "allocated", + "stack": false + } + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(go_memstats_alloc_bytes_total{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "alloc_bytes_total", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(go_memstats_alloc_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "allocated", + "refId": "B", + "step": 10 + }, + { + "expr": "sum(go_memstats_buck_hash_sys_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "profiling bucket hash table", + "refId": "C", + "step": 10 + }, + { + "expr": "sum(go_memstats_gc_sys_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "GC metadata", + "refId": "D", + "step": 10 + }, + { + "expr": "sum(go_memstats_heap_alloc_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "heap in-use", + "refId": "E", + "step": 10 + }, + { + "expr": "sum(go_memstats_heap_idle_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "heap idle", + "refId": "F", + "step": 10 + }, + { + "expr": "sum(go_memstats_heap_inuse_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "heap in use", + "refId": "G", + "step": 10 + }, + { + "expr": "sum(go_memstats_heap_released_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "heap released", + "refId": "H", + "step": 10 + }, + { + "expr": "sum(go_memstats_heap_sys_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "heap system", + "refId": "I", + "step": 10 + }, + { + "expr": "sum(go_memstats_mcache_inuse_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mcache in use", + "refId": "J", + "step": 10 + }, + { + "expr": "sum(go_memstats_mcache_sys_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mcache sys", + "refId": "K", + "step": 10 + }, + { + "expr": "sum(go_memstats_mspan_inuse_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mspan in use", + "refId": "L", + "step": 10 + }, + { + "expr": "sum(go_memstats_mspan_sys_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mspan sys", + "refId": "M", + "step": 10 + }, + { + "expr": "sum(go_memstats_next_gc_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "heap next gc", + "refId": "N", + "step": 10 + }, + { + "expr": "sum(go_memstats_other_sys_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "other sys", + "refId": "O", + "step": 10 + }, + { + "expr": "sum(go_memstats_stack_inuse_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "stack in use", + "refId": "P", + "step": 10 + }, + { + "expr": "sum(go_memstats_stack_sys_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "stack sys", + "refId": "Q", + "step": 10 + }, + { + "expr": "sum(go_memstats_sys_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "sys", + "refId": "R", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Go Memory Usage (FIXME)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "prometheus_target_interval_length_seconds{instance=~\"$instance\", job=~\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{quantile}} {{interval}}", + "refId": "A", + "step": 20 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Scrape Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Seconds", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(prometheus_target_interval_length_seconds_count{job=~\"$job\",instance=~\"$instance\"}[5m])) by (interval)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{interval}}", + "refId": "A", + "step": 20 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Target Scrapes / 5m", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Scrapes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Broken, ignore", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "default", + "value": "default" + }, + "hide": 0, + "includeAll": false, + "label": "datasource", + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "job", + "options": [], + "query": "query_result(prometheus_tsdb_head_samples_appended_total)", + "refresh": 2, + "regex": "/.*job=\"([^\"]+)/", + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "instance", + "options": [], + "query": "query_result(up{job=~\"$job\"})", + "refresh": 2, + "regex": "/.*instance=\"([^\"]+).*/", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "selected": true, + "text": "1h", + "value": "1h" + }, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "interval", + "options": [ + { + "selected": true, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "3h", + "value": "3h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + }, + { + "selected": false, + "text": "1d", + "value": "1d" + }, + { + "selected": false, + "text": "2d", + "value": "2d" + }, + { + "selected": false, + "text": "7d", + "value": "7d" + }, + { + "selected": false, + "text": "30d", + "value": "30d" + }, + { + "selected": false, + "text": "90d", + "value": "90d" + }, + { + "selected": false, + "text": "180d", + "value": "180d" + } + ], + "query": "1h, 3h, 6h, 12h, 1d, 2d, 7d, 30d, 90d, 180d", + "type": "custom" + } + ] + }, + "time": { + "from": "now-4h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Prometheus", + "version": 1 +} diff --git a/terraform-ci-infra/1n_nmd/grafana/main.tf b/terraform-ci-infra/1n_nmd/grafana/main.tf new file mode 100644 index 0000000000..b67ba03985 --- /dev/null +++ b/terraform-ci-infra/1n_nmd/grafana/main.tf @@ -0,0 +1,24 @@ +locals { + datacenters = join(",", var.nomad_datacenters) +} + +data "template_file" "nomad_job_grafana" { + template = file("${path.module}/conf/nomad/grafana.hcl") + vars = { + datacenters = local.datacenters + job_name = var.grafana_job_name + use_canary = var.grafana_use_canary + group_count = var.grafana_group_count + service_name = var.grafana_service_name + use_vault_provider = var.grafana_vault_secret.use_vault_provider + image = var.grafana_container_image + cpu = var.grafana_cpu + mem = var.grafana_mem + port = var.grafana_port + } +} + +resource "nomad_job" "nomad_job_grafana" { + jobspec = data.template_file.nomad_job_grafana.rendered + detach = false +} \ No newline at end of file diff --git a/terraform-ci-infra/1n_nmd/grafana/variables.tf b/terraform-ci-infra/1n_nmd/grafana/variables.tf new file mode 100644 index 0000000000..0c2382b16a --- /dev/null +++ b/terraform-ci-infra/1n_nmd/grafana/variables.tf @@ -0,0 +1,66 @@ +# Nomad +variable "nomad_datacenters" { + description = "Nomad data centers" + type = list(string) + default = [ "dc1" ] +} + +# Grafana +variable "grafana_job_name" { + description = "Grafana job name" + type = string + default = "grafana" +} + +variable "grafana_group_count" { + description = "Number of grafana group instances" + type = number + default = 1 +} + +variable "grafana_service_name" { + description = "Grafana service name" + type = string + default = "grafana" +} + +variable "grafana_container_image" { + description = "Grafana docker image" + type = string + default = "grafana/grafana:7.3.7" +} + +variable "grafana_use_canary" { + description = "Uses canary deployment" + type = bool + default = false +} + +variable "grafana_vault_secret" { + description = "Set of properties to be able to fetch secret from vault" + type = object({ + use_vault_provider = bool, + vault_kv_policy_name = string, + vault_kv_path = string, + vault_kv_field_access_key = string, + vault_kv_field_secret_key = string + }) +} + +variable "grafana_cpu" { + description = "Grafana CPU allocation" + type = number + default = 2000 +} + +variable "grafana_mem" { + description = "Grafana RAM allocation" + type = number + default = 8192 +} + +variable "grafana_port" { + description = "Grafana TCP allocation" + type = number + default = 3000 +} \ No newline at end of file diff --git a/terraform-ci-infra/1n_nmd/main.tf b/terraform-ci-infra/1n_nmd/main.tf index 4da8929e8d..4e0b7537c4 100644 --- a/terraform-ci-infra/1n_nmd/main.tf +++ b/terraform-ci-infra/1n_nmd/main.tf @@ -4,64 +4,182 @@ # and downstream modules can simply declare resources for that provider # and have them automatically associated with the root provider # configurations. +module "alertmanager" { + source = "./alertmanager" + providers = { + nomad = nomad.yul1 + } + + # nomad + nomad_datacenters = [ "yul1" ] + + # alertmanager + alertmanager_job_name = "prod-alertmanager" + alertmanager_use_canary = true + alertmanager_group_count = 1 + alertmanager_vault_secret = { + use_vault_provider = false, + vault_kv_policy_name = "kv-secret", + vault_kv_path = "secret/data/prometheus", + vault_kv_field_access_key = "access_key", + vault_kv_field_secret_key = "secret_key" + } + alertmanager_version = "0.21.0" + alertmanager_cpu = 1000 + alertmanager_mem = 1024 + alertmanager_port = 9093 + alertmanager_slack_api_url = "https://hooks.slack.com/services/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" + alertmanager_slack_channel = "fdio-infra-monitoring" +} + +module "exporter" { + source = "./exporter" + providers = { + nomad = nomad.yul1 + } + + # nomad + nomad_datacenters = [ "yul1" ] + + # exporter + exporter_job_name = "prod-exporter" + exporter_use_canary = false + + # node + node_version = "1.0.1" + node_port = 9100 + + # blackbox + blackbox_version = "0.18.0" + blackbox_port = 9115 + + # cadvisor + cadvisor_image = "gcr.io/cadvisor/cadvisor:latest" + cadvisor_port = 8080 +} + +module "grafana" { + source = "./grafana" + providers = { + nomad = nomad.yul1 + } + + # nomad + nomad_datacenters = [ "yul1" ] + + # grafana + grafana_job_name = "prod-grafana" + grafana_use_canary = true + grafana_group_count = 1 + grafana_vault_secret = { + use_vault_provider = false, + vault_kv_policy_name = "kv-secret", + vault_kv_path = "secret/data/prometheus", + vault_kv_field_access_key = "access_key", + vault_kv_field_secret_key = "secret_key" + } + grafana_container_image = "grafana/grafana:7.3.7" + grafana_cpu = 2000 + grafana_mem = 2048 + grafana_port = 3000 +} + module "minio" { - source = "./minio" - providers = { + source = "./minio" + providers = { nomad = nomad.yul1 } # nomad - nomad_datacenters = [ "yul1" ] - nomad_host_volume = "prod-volume-data1-1" + nomad_datacenters = [ "yul1" ] + nomad_host_volume = "prod-volume-data1-1" # minio - minio_job_name = "prod-minio" - minio_group_count = 4 - minio_service_name = "storage" - minio_host = "http://10.32.8.1{4...7}" - minio_port = 9000 - minio_container_image = "minio/minio:RELEASE.2020-12-03T05-49-24Z" - minio_vault_secret = { - use_vault_provider = false, - vault_kv_policy_name = "kv-secret", - vault_kv_path = "secret/data/minio", - vault_kv_field_access_key = "access_key", - vault_kv_field_secret_key = "secret_key" + minio_job_name = "prod-minio" + minio_group_count = 4 + minio_service_name = "storage" + minio_host = "http://10.32.8.1{4...7}" + minio_port = 9000 + minio_container_image = "minio/minio:RELEASE.2020-12-03T05-49-24Z" + minio_vault_secret = { + use_vault_provider = false, + vault_kv_policy_name = "kv-secret", + vault_kv_path = "secret/data/minio", + vault_kv_field_access_key = "access_key", + vault_kv_field_secret_key = "secret_key" } - minio_data_dir = "/data/" - minio_use_host_volume = true - minio_use_canary = true - minio_envs = [ "MINIO_BROWSER=\"off\"" ] + minio_data_dir = "/data/" + minio_use_host_volume = true + minio_use_canary = true + minio_envs = [ "MINIO_BROWSER=\"off\"" ] # minio client - mc_job_name = "prod-mc" - mc_container_image = "minio/mc:RELEASE.2020-12-10T01-26-17Z" - mc_extra_commands = [ + mc_job_name = "prod-mc" + mc_container_image = "minio/mc:RELEASE.2020-12-10T01-26-17Z" + mc_extra_commands = [ "mc policy set public LOCALMINIO/logs.fd.io", "mc policy set public LOCALMINIO/docs.fd.io", "mc ilm add --expiry-days '180' LOCALMINIO/logs.fd.io", "mc admin user add LOCALMINIO storage Storage1234", "mc admin policy set LOCALMINIO writeonly user=storage" ] - minio_buckets = [ "logs.fd.io", "docs.fd.io" ] + minio_buckets = [ "logs.fd.io", "docs.fd.io" ] } module "nginx" { - source = "./nginx" - providers = { + source = "./nginx" + providers = { nomad = nomad.yul1 } # nomad - nomad_datacenters = [ "yul1" ] + nomad_datacenters = [ "yul1" ] # nginx - nginx_job_name = "prod-nginx" + nginx_job_name = "prod-nginx" } -#module "vpp_device" { -# source = "./vpp_device" -# providers = { -# nomad = nomad.yul1 -# } -#} \ No newline at end of file +module "prometheus" { + source = "./prometheus" + providers = { + nomad = nomad.yul1 + } + + # nomad + nomad_datacenters = [ "yul1" ] + nomad_host_volume = "prod-volume-data1-1" + + # prometheus + prometheus_job_name = "prod-prometheus" + prometheus_use_canary = true + prometheus_group_count = 4 + prometheus_vault_secret = { + use_vault_provider = false, + vault_kv_policy_name = "kv-secret", + vault_kv_path = "secret/data/prometheus", + vault_kv_field_access_key = "access_key", + vault_kv_field_secret_key = "secret_key" + } + prometheus_data_dir = "/data/" + prometheus_use_host_volume = true + prometheus_version = "2.24.0" + prometheus_cpu = 2000 + prometheus_mem = 8192 + prometheus_port = 9090 +} + +module "vpp_device" { + source = "./vpp_device" + providers = { + nomad = nomad.yul1 + } + + # nomad + nomad_datacenters = [ "yul1" ] + + # csit_shim + csit_shim_job_name = "prod-device-csit-shim" + csit_shim_group_count = "1" + csit_shim_cpu = "1000" + csit_shim_mem = "5000" +} \ No newline at end of file diff --git a/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl b/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl new file mode 100644 index 0000000000..2d74662f31 --- /dev/null +++ b/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl @@ -0,0 +1,569 @@ +job "${job_name}" { + # The "region" parameter specifies the region in which to execute the job. + # If omitted, this inherits the default region name of "global". + # region = "global" + # + # The "datacenters" parameter specifies the list of datacenters which should + # be considered when placing this task. This must be provided. + datacenters = "${datacenters}" + + # The "type" parameter controls the type of job, which impacts the scheduler's + # decision on placement. This configuration is optional and defaults to + # "service". For a full list of job types and their differences, please see + # the online documentation. + # + # For more information, please see the online documentation at: + # + # https://www.nomadproject.io/docs/jobspec/schedulers + # + type = "service" + + update { + # The "max_parallel" parameter specifies the maximum number of updates to + # perform in parallel. In this case, this specifies to update a single task + # at a time. + max_parallel = 1 + + health_check = "checks" + + # The "min_healthy_time" parameter specifies the minimum time the allocation + # must be in the healthy state before it is marked as healthy and unblocks + # further allocations from being updated. + min_healthy_time = "10s" + + # The "healthy_deadline" parameter specifies the deadline in which the + # allocation must be marked as healthy after which the allocation is + # automatically transitioned to unhealthy. Transitioning to unhealthy will + # fail the deployment and potentially roll back the job if "auto_revert" is + # set to true. + healthy_deadline = "3m" + + # The "progress_deadline" parameter specifies the deadline in which an + # allocation must be marked as healthy. The deadline begins when the first + # allocation for the deployment is created and is reset whenever an allocation + # as part of the deployment transitions to a healthy state. If no allocation + # transitions to the healthy state before the progress deadline, the + # deployment is marked as failed. + progress_deadline = "10m" + +%{ if use_canary } + # The "canary" parameter specifies that changes to the job that would result + # in destructive updates should create the specified number of canaries + # without stopping any previous allocations. Once the operator determines the + # canaries are healthy, they can be promoted which unblocks a rolling update + # of the remaining allocations at a rate of "max_parallel". + # + # Further, setting "canary" equal to the count of the task group allows + # blue/green deployments. When the job is updated, a full set of the new + # version is deployed and upon promotion the old version is stopped. + canary = 1 + + # Specifies if the job should auto-promote to the canary version when all + # canaries become healthy during a deployment. Defaults to false which means + # canaries must be manually updated with the nomad deployment promote + # command. + auto_promote = true + + # The "auto_revert" parameter specifies if the job should auto-revert to the + # last stable job on deployment failure. A job is marked as stable if all the + # allocations as part of its deployment were marked healthy. + auto_revert = true +%{ endif } + } + + # The "group" stanza defines a series of tasks that should be co-located on + # the same Nomad client. Any task within a group will be placed on the same + # client. + # + # For more information and examples on the "group" stanza, please see + # the online documentation at: + # + # https://www.nomadproject.io/docs/job-specification/group + # + group "prod-group1-${service_name}" { + # The "count" parameter specifies the number of the task groups that should + # be running under this group. This value must be non-negative and defaults + # to 1. + count = ${group_count} + + # The volume stanza allows the group to specify that it requires a given + # volume from the cluster. + # + # For more information and examples on the "volume" stanza, please see + # the online documentation at: + # + # https://www.nomadproject.io/docs/job-specification/volume + # + %{ if use_host_volume } + volume "prod-volume1-${service_name}" { + type = "host" + read_only = false + source = "${host_volume}" + } + %{ endif } + + # The constraint allows restricting the set of eligible nodes. Constraints + # may filter on attributes or client metadata. + # + # For more information and examples on the "volume" stanza, please see + # the online documentation at: + # + # https://www.nomadproject.io/docs/job-specification/constraint + # + constraint { + attribute = "$${attr.cpu.arch}" + operator = "!=" + value = "arm64" + } + + # The "task" stanza creates an individual unit of work, such as a Docker + # container, web application, or batch processing. + # + # For more information and examples on the "task" stanza, please see + # the online documentation at: + # + # https://www.nomadproject.io/docs/job-specification/task + # + task "prod-task1-${service_name}" { + # The "driver" parameter specifies the task driver that should be used to + # run the task. + driver = "exec" + + %{ if use_host_volume } + volume_mount { + volume = "prod-volume1-${service_name}" + destination = "${data_dir}" + read_only = false + } + %{ endif } + + %{ if use_vault_provider } + vault { + policies = "${vault_kv_policy_name}" + } + %{ endif } + + # The "config" stanza specifies the driver configuration, which is passed + # directly to the driver to start the task. The details of configurations + # are specific to each driver, so please see specific driver + # documentation for more information. + config { + command = "local/prometheus-${version}.linux-amd64/prometheus" + args = [ + "--config.file=secrets/prometheus.yml", + "--storage.tsdb.path=${data_dir}prometheus/", + "--storage.tsdb.retention.time=15d" + ] + } + + # The artifact stanza instructs Nomad to fetch and unpack a remote resource, + # such as a file, tarball, or binary. Nomad downloads artifacts using the + # popular go-getter library, which permits downloading artifacts from a + # variety of locations using a URL as the input source. + # + # For more information and examples on the "artifact" stanza, please see + # the online documentation at: + # + # https://www.nomadproject.io/docs/job-specification/artifact + # + artifact { + source = "${url}" + } + + # The "template" stanza instructs Nomad to manage a template, such as + # a configuration file or script. This template can optionally pull data + # from Consul or Vault to populate runtime configuration data. + # + # For more information and examples on the "template" stanza, please see + # the online documentation at: + # + # https://www.nomadproject.io/docs/job-specification/template + # + template { + change_mode = "noop" + change_signal = "SIGINT" + destination = "secrets/alerts.yml" + left_delimiter = "{{{" + right_delimiter = "}}}" + data = < 80 + for: 0m + labels: + severity: warning + annotations: + summary: "Host high CPU load (instance {{ $labels.instance }})." + description: "CPU load is > 80%." + - alert: HostOutOfMemory + expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10 + for: 2m + labels: + severity: warning + annotations: + summary: "Host out of memory (instance {{ $labels.instance }})." + description: "Node memory is filling up (< 10% left)." + - alert: HostOomKillDetected + expr: increase(node_vmstat_oom_kill[1m]) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: "Host OOM kill detected (instance {{ $labels.instance }})." + description: "OOM kill detected." + - alert: HostMemoryUnderMemoryPressure + expr: rate(node_vmstat_pgmajfault[1m]) > 1000 + for: 2m + labels: + severity: warning + annotations: + summary: "Host memory under memory pressure (instance {{ $labels.instance }})." + description: "The node is under heavy memory pressure. High rate of major page faults." + - alert: HostOutOfDiskSpace + expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 + for: 2m + labels: + severity: warning + annotations: + summary: "Host out of disk space (instance {{ $labels.instance }})." + description: "Disk is almost full (< 10% left)." + - alert: HostRaidDiskFailure + expr: node_md_disks{state="failed"} > 0 + for: 2m + labels: + severity: warning + annotations: + summary: "Host RAID disk failure (instance {{ $labels.instance }})." + description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap." + - alert: HostConntrackLimit + expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8 + for: 5m + labels: + severity: warning + annotations: + summary: "Host conntrack limit (instance {{ $labels.instance }})." + description: "The number of conntrack is approching limit." + - alert: HostNetworkInterfaceSaturated + expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8 + for: 1m + labels: + severity: warning + annotations: + summary: "Host Network Interface Saturated (instance {{ $labels.instance }})." + description: "The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded." + - alert: HostSystemdServiceCrashed + expr: node_systemd_unit_state{state="failed"} == 1 + for: 0m + labels: + severity: warning + annotations: + summary: "Host SystemD service crashed (instance {{ $labels.instance }})." + description: "SystemD service crashed." + - alert: HostEdacCorrectableErrorsDetected + expr: increase(node_edac_correctable_errors_total[1m]) > 0 + for: 0m + labels: + severity: info + annotations: + summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})." + description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.' + - alert: HostEdacUncorrectableErrorsDetected + expr: node_edac_uncorrectable_errors_total > 0 + for: 0m + labels: + severity: warning + annotations: + summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})." + description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.' +- name: "Min.io" + rules: + - alert: MinioDiskOffline + expr: minio_offline_disks > 0 + for: 0m + labels: + severity: critical + annotations: + summary: "Minio disk offline (instance {{ $labels.instance }})" + description: "Minio disk is offline." + - alert: MinioStorageSpaceExhausted + expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 < 10 + for: 2m + labels: + severity: warning + annotations: + summary: "Minio storage space exhausted (instance {{ $labels.instance }})." + description: "Minio storage space is low (< 10 GB)." +- name: "Prometheus" + rules: + - alert: PrometheusConfigurationReloadFailure + expr: prometheus_config_last_reload_successful != 1 + for: 0m + labels: + severity: warning + annotations: + summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})." + description: "Prometheus configuration reload error." + - alert: PrometheusTooManyRestarts + expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2 + for: 0m + labels: + severity: warning + annotations: + summary: "Prometheus too many restarts (instance {{ $labels.instance }})." + description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping." + - alert: PrometheusAlertmanagerConfigurationReloadFailure + expr: alertmanager_config_last_reload_successful != 1 + for: 0m + labels: + severity: warning + annotations: + summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})." + description: "AlertManager configuration reload error." + - alert: PrometheusRuleEvaluationFailures + expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})." + description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts." + - alert: PrometheusTargetScrapingSlow + expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60 + for: 5m + labels: + severity: warning + annotations: + summary: "Prometheus target scraping slow (instance {{ $labels.instance }})." + description: "Prometheus is scraping exporters slowly." + - alert: PrometheusTsdbCompactionsFailed + expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})." + description: "Prometheus encountered {{ $value }} TSDB compactions failures." + - alert: PrometheusTsdbHeadTruncationsFailed + expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})." + description: "Prometheus encountered {{ $value }} TSDB head truncation failures." + - alert: PrometheusTsdbWalCorruptions + expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})." + description: "Prometheus encountered {{ $value }} TSDB WAL corruptions." + - alert: PrometheusTsdbWalTruncationsFailed + expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})." + description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures." +EOH + } + + template { + change_mode = "noop" + change_signal = "SIGINT" + destination = "secrets/prometheus.yml" + data = <