From 721f62270ea619ce8b9b689c14805dea6b32c8c6 Mon Sep 17 00:00:00 2001 From: Kochetkov S Date: Thu, 18 Jun 2026 12:26:04 +0300 Subject: [PATCH] Fix brusnika observability exporters and alerts --- .../camunda-servicemonitors.yaml | 202 ++++++++++++++++++ .../kafka-exporter-yc-dashboard.yaml | 2 +- .../kafka-exporter-yc-rules.yaml | 18 +- .../infrastructure/kustomization.yaml | 2 + .../patches/postgres-exporter.yaml | 38 +--- .../infrastructure/rabbitmq-exporter.yaml | 90 ++++++++ .../camunda-servicemonitors.yaml | 202 ++++++++++++++++++ .../kafka-exporter-yc-dashboard.yaml | 2 +- .../kafka-exporter-yc-rules.yaml | 18 +- .../infrastructure/kustomization.yaml | 2 + .../patches/postgres-exporter.yaml | 12 +- .../infrastructure/rabbitmq-exporter.yaml | 90 ++++++++ 12 files changed, 620 insertions(+), 58 deletions(-) create mode 100644 clusters/brusnika-prod/infrastructure/camunda-servicemonitors.yaml create mode 100644 clusters/brusnika-prod/infrastructure/rabbitmq-exporter.yaml create mode 100644 clusters/brusnika-stage/infrastructure/camunda-servicemonitors.yaml create mode 100644 clusters/brusnika-stage/infrastructure/rabbitmq-exporter.yaml diff --git a/clusters/brusnika-prod/infrastructure/camunda-servicemonitors.yaml b/clusters/brusnika-prod/infrastructure/camunda-servicemonitors.yaml new file mode 100644 index 0000000..cff5a9f --- /dev/null +++ b/clusters/brusnika-prod/infrastructure/camunda-servicemonitors.yaml @@ -0,0 +1,202 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: camunda-camunda-prod-connectors + namespace: camunda + labels: + app: camunda-platform + app.kubernetes.io/instance: camunda + app.kubernetes.io/name: camunda-prod + app.kubernetes.io/part-of: camunda-platform + release: metrics +spec: + endpoints: + - honorLabels: true + interval: 10s + path: /actuator/prometheus + port: http + relabelings: + - action: replace + targetLabel: cluster + replacement: brusnika-prod + - action: replace + targetLabel: source_cluster 
+ replacement: brusnika-prod + selector: + matchLabels: + app: camunda-platform + app.kubernetes.io/component: connectors +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: camunda-camunda-prod-identity + namespace: camunda + labels: + app: camunda-platform + app.kubernetes.io/instance: camunda + app.kubernetes.io/name: camunda-prod + app.kubernetes.io/part-of: camunda-platform + release: metrics +spec: + endpoints: + - honorLabels: true + interval: 10s + path: /actuator/prometheus + port: metrics + relabelings: + - action: replace + targetLabel: cluster + replacement: brusnika-prod + - action: replace + targetLabel: source_cluster + replacement: brusnika-prod + selector: + matchLabels: + app: camunda-platform + app.kubernetes.io/component: identity +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: camunda-camunda-prod-operate + namespace: camunda + labels: + app: camunda-platform + app.kubernetes.io/instance: camunda + app.kubernetes.io/name: camunda-prod + app.kubernetes.io/part-of: camunda-platform + release: metrics +spec: + endpoints: + - honorLabels: true + interval: 10s + path: /actuator/prometheus + port: management + relabelings: + - action: replace + targetLabel: cluster + replacement: brusnika-prod + - action: replace + targetLabel: source_cluster + replacement: brusnika-prod + selector: + matchLabels: + app: camunda-platform + app.kubernetes.io/component: operate +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: camunda-camunda-prod-optimize + namespace: camunda + labels: + app: camunda-platform + app.kubernetes.io/instance: camunda + app.kubernetes.io/name: camunda-prod + app.kubernetes.io/part-of: camunda-platform + release: metrics +spec: + endpoints: + - honorLabels: true + interval: 10s + path: /actuator/prometheus + port: management + relabelings: + - action: replace + targetLabel: cluster + replacement: brusnika-prod + - action: replace + 
targetLabel: source_cluster + replacement: brusnika-prod + selector: + matchLabels: + app: camunda-platform + app.kubernetes.io/component: optimize +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: camunda-camunda-prod-tasklist + namespace: camunda + labels: + app: camunda-platform + app.kubernetes.io/instance: camunda + app.kubernetes.io/name: camunda-prod + app.kubernetes.io/part-of: camunda-platform + release: metrics +spec: + endpoints: + - honorLabels: true + interval: 10s + path: /actuator/prometheus + port: management + relabelings: + - action: replace + targetLabel: cluster + replacement: brusnika-prod + - action: replace + targetLabel: source_cluster + replacement: brusnika-prod + selector: + matchLabels: + app: camunda-platform + app.kubernetes.io/component: tasklist +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: camunda-camunda-prod-zeebe + namespace: camunda + labels: + app: camunda-platform + app.kubernetes.io/instance: camunda + app.kubernetes.io/name: camunda-prod + app.kubernetes.io/part-of: camunda-platform + release: metrics +spec: + endpoints: + - honorLabels: true + interval: 10s + path: /actuator/prometheus + port: http + relabelings: + - action: replace + targetLabel: cluster + replacement: brusnika-prod + - action: replace + targetLabel: source_cluster + replacement: brusnika-prod + selector: + matchLabels: + app: camunda-platform + app.kubernetes.io/component: zeebe-broker +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: camunda-camunda-prod-zeebe-gateway + namespace: camunda + labels: + app: camunda-platform + app.kubernetes.io/instance: camunda + app.kubernetes.io/name: camunda-prod + app.kubernetes.io/part-of: camunda-platform + release: metrics +spec: + endpoints: + - honorLabels: true + interval: 10s + path: /actuator/prometheus + port: http + relabelings: + - action: replace + targetLabel: cluster + replacement: brusnika-prod + - 
action: replace + targetLabel: source_cluster + replacement: brusnika-prod + selector: + matchLabels: + app: camunda-platform + app.kubernetes.io/component: zeebe-gateway diff --git a/clusters/brusnika-prod/infrastructure/kafka-exporter-yc-dashboard.yaml b/clusters/brusnika-prod/infrastructure/kafka-exporter-yc-dashboard.yaml index b0ffb1b..6fe782b 100644 --- a/clusters/brusnika-prod/infrastructure/kafka-exporter-yc-dashboard.yaml +++ b/clusters/brusnika-prod/infrastructure/kafka-exporter-yc-dashboard.yaml @@ -9,4 +9,4 @@ metadata: grafana_folder: Kafka data: kafka-exporter-yc.json: |- - {"annotations":{"list":[{"builtIn":1,"datasource":{"type":"grafana","uid":"-- Grafana --"},"enable":true,"hide":true,"iconColor":"rgba(0, 211, 255, 1)","name":"Annotations & Alerts","type":"dashboard"}]},"editable":true,"fiscalYearStartMonth":0,"graphTooltip":0,"links":[],"panels":[{"id":1,"type":"row","title":"YC Kafka exporter","collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":0},"panels":[]},{"id":2,"type":"stat","title":"Exporter up","gridPos":{"h":4,"w":6,"x":0,"y":1},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"max(up{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\"})","refId":"A","range":false,"instant":true}],"fieldConfig":{"defaults":{"unit":"short","mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"red"},{"color":"green","value":1}]}},"overrides":[]},"options":{"colorMode":"background","graphMode":"none","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"}},{"id":3,"type":"stat","title":"Topics without new messages for 12h","gridPos":{"h":4,"w":9,"x":6,"y":1},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"count(((sum by (topic) 
(max_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"^(prod|system_log)\"}[12h]) - min_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"^(prod|system_log)\"}[12h])) == 0) and on(topic) (min by (topic) (count_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"^(prod|system_log)\"}[12h])) >= 1400) and on(topic) (sum by (topic) (kafka_topic_partitions{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"^(prod|system_log)\"}) > 0)))","refId":"A","range":false,"instant":true}],"fieldConfig":{"defaults":{"unit":"short","mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"},{"color":"orange","value":1},{"color":"red","value":5}]}},"overrides":[]},"options":{"colorMode":"background","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"}},{"id":4,"type":"stat","title":"Known topics","gridPos":{"h":4,"w":9,"x":15,"y":1},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"count(count by (topic) (kafka_topic_partitions{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"^(prod|system_log)\"}))","refId":"A","range":false,"instant":true}],"fieldConfig":{"defaults":{"unit":"short","mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"colorMode":"value","graphMode":"none","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"}},{"id":5,"type":"table","title":"Topics with no offset growth for 
12h","gridPos":{"h":8,"w":24,"x":0,"y":5},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"((sum by (topic, kafka_instance, cluster) (max_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"^(prod|system_log)\"}[12h]) - min_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"^(prod|system_log)\"}[12h])) == 0) and on(topic) (min by (topic) (count_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"^(prod|system_log)\"}[12h])) >= 1400) and on(topic) (sum by (topic) (kafka_topic_partitions{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"^(prod|system_log)\"}) > 0))","refId":"A","range":false,"instant":true}],"fieldConfig":{"defaults":{"custom":{"align":"auto","cellOptions":{"type":"auto"}},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"showHeader":true,"cellHeight":"sm"},"transformations":[{"id":"labelsToFields","options":{"mode":"columns"}}]},{"id":10,"type":"row","title":"Topic traffic","collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":13},"panels":[]},{"id":11,"type":"timeseries","title":"Topic offset delta by $__rate_interval","gridPos":{"h":8,"w":12,"x":0,"y":14},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"sum by (topic) (clamp_min(delta(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"$topic\"}[$__rate_interval]), 
0))","refId":"A","range":true,"legendFormat":"{{topic}}"}],"fieldConfig":{"defaults":{"unit":"short","custom":{"drawStyle":"line","lineInterpolation":"linear","lineWidth":1,"fillOpacity":10,"spanNulls":false,"showPoints":"auto","axisPlacement":"auto","scaleDistribution":{"type":"linear"},"hideFrom":{"legend":false,"tooltip":false,"viz":false}},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"legend":{"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"desc"}}},{"id":12,"type":"timeseries","title":"Topic offset delta over 12h","gridPos":{"h":8,"w":12,"x":12,"y":14},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"sum by (topic) (max_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"$topic\"}[12h]) - min_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"$topic\"}[12h]))","refId":"A","range":true,"legendFormat":"{{topic}}"}],"fieldConfig":{"defaults":{"unit":"short","custom":{"drawStyle":"line","lineInterpolation":"linear","lineWidth":1,"fillOpacity":10,"spanNulls":false,"showPoints":"auto","axisPlacement":"auto","scaleDistribution":{"type":"linear"},"hideFrom":{"legend":false,"tooltip":false,"viz":false}},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"legend":{"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"desc"}}},{"id":13,"type":"timeseries","title":"Current topic offset","gridPos":{"h":8,"w":12,"x":0,"y":22},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"sum by (topic) 
(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"$topic\"})","refId":"A","range":true,"legendFormat":"{{topic}}"}],"fieldConfig":{"defaults":{"unit":"short","custom":{"drawStyle":"line","lineInterpolation":"linear","lineWidth":1,"fillOpacity":10,"spanNulls":false,"showPoints":"auto","axisPlacement":"auto","scaleDistribution":{"type":"linear"},"hideFrom":{"legend":false,"tooltip":false,"viz":false}},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"legend":{"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"desc"}}},{"id":14,"type":"bargauge","title":"Partitions by topic","gridPos":{"h":8,"w":12,"x":12,"y":22},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"sum by (topic) (kafka_topic_partitions{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"$topic\"})","refId":"A","range":false,"legendFormat":"{{topic}}","instant":true}],"fieldConfig":{"defaults":{"unit":"short","mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"displayMode":"gradient","legend":{"displayMode":"list","placement":"bottom","showLegend":false},"orientation":"horizontal","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"showUnfilled":true}},{"id":20,"type":"row","title":"Consumer groups","collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":30},"panels":[]},{"id":21,"type":"timeseries","title":"Consumer lag by group/topic","gridPos":{"h":8,"w":24,"x":0,"y":31},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"sum by (consumergroup, topic) 
(kafka_consumergroup_lag_sum{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"$topic\"})","refId":"A","range":true,"legendFormat":"{{consumergroup}} / {{topic}}"}],"fieldConfig":{"defaults":{"unit":"short","custom":{"drawStyle":"line","lineInterpolation":"linear","lineWidth":1,"fillOpacity":10,"spanNulls":false,"showPoints":"auto","axisPlacement":"auto","scaleDistribution":{"type":"linear"},"hideFrom":{"legend":false,"tooltip":false,"viz":false}},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"legend":{"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"desc"}}}],"preload":false,"refresh":"30s","schemaVersion":41,"tags":["kafka","kafka-exporter","brusnika-prod"],"templating":{"list":[{"current":{"text":"VictoriaMetrics","value":"vm"},"includeAll":false,"label":"Data Source","name":"ds_datasource","options":[],"query":"prometheus","refresh":1,"regex":"","type":"datasource"},{"current":{"text":"All","value":"$__all"},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"definition":"label_values(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"^(prod|system_log)\"},topic)","allValue":".*","includeAll":true,"label":"Topic","name":"topic","options":[],"query":{"qryType":1,"query":"label_values(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"^(prod|system_log)\"},topic)","refId":"PrometheusVariableQueryEditor-VariableQuery"},"refresh":1,"regex":"","type":"query"}]},"time":{"from":"now-12h","to":"now"},"timepicker":{},"timezone":"","title":"Kafka Exporter / YC Kafka","uid":"kafka-exporter-yc","version":1} + {"annotations":{"list":[{"builtIn":1,"datasource":{"type":"grafana","uid":"-- Grafana --"},"enable":true,"hide":true,"iconColor":"rgba(0, 211, 255, 1)","name":"Annotations & 
Alerts","type":"dashboard"}]},"editable":true,"fiscalYearStartMonth":0,"graphTooltip":0,"links":[],"panels":[{"id":1,"type":"row","title":"YC Kafka exporter","collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":0},"panels":[]},{"id":2,"type":"stat","title":"Exporter up","gridPos":{"h":4,"w":6,"x":0,"y":1},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"max(up{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\"})","refId":"A","range":false,"instant":true}],"fieldConfig":{"defaults":{"unit":"short","mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"red"},{"color":"green","value":1}]}},"overrides":[]},"options":{"colorMode":"background","graphMode":"none","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"}},{"id":3,"type":"stat","title":"Topics without new messages for 12h","gridPos":{"h":4,"w":9,"x":6,"y":1},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"count(((sum by (topic) (max_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"^(bru\\.cde\\.folders\\.prod|system-log-prod)$\"}[12h]) - min_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"^(bru\\.cde\\.folders\\.prod|system-log-prod)$\"}[12h])) == 0) and on(topic) (min by (topic) (count_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"^(bru\\.cde\\.folders\\.prod|system-log-prod)$\"}[12h])) >= 1400) and on(topic) (sum by (topic) (kafka_topic_partitions{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"^(bru\\.cde\\.folders\\.prod|system-log-prod)$\"}) > 
0)))","refId":"A","range":false,"instant":true}],"fieldConfig":{"defaults":{"unit":"short","mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"},{"color":"orange","value":1},{"color":"red","value":5}]}},"overrides":[]},"options":{"colorMode":"background","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"}},{"id":4,"type":"stat","title":"Known topics","gridPos":{"h":4,"w":9,"x":15,"y":1},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"count(count by (topic) (kafka_topic_partitions{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"^(bru\\.cde\\.folders\\.prod|system-log-prod)$\"}))","refId":"A","range":false,"instant":true}],"fieldConfig":{"defaults":{"unit":"short","mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"colorMode":"value","graphMode":"none","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"}},{"id":5,"type":"table","title":"Topics with no offset growth for 12h","gridPos":{"h":8,"w":24,"x":0,"y":5},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"((sum by (topic, kafka_instance, cluster) (max_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"^(bru\\.cde\\.folders\\.prod|system-log-prod)$\"}[12h]) - min_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"^(bru\\.cde\\.folders\\.prod|system-log-prod)$\"}[12h])) == 0) and on(topic) (min by (topic) 
(count_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"^(bru\\.cde\\.folders\\.prod|system-log-prod)$\"}[12h])) >= 1400) and on(topic) (sum by (topic) (kafka_topic_partitions{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"^(bru\\.cde\\.folders\\.prod|system-log-prod)$\"}) > 0))","refId":"A","range":false,"instant":true}],"fieldConfig":{"defaults":{"custom":{"align":"auto","cellOptions":{"type":"auto"}},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"showHeader":true,"cellHeight":"sm"},"transformations":[{"id":"labelsToFields","options":{"mode":"columns"}}]},{"id":10,"type":"row","title":"Topic traffic","collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":13},"panels":[]},{"id":11,"type":"timeseries","title":"Topic offset delta by $__rate_interval","gridPos":{"h":8,"w":12,"x":0,"y":14},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"sum by (topic) (clamp_min(delta(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"$topic\"}[$__rate_interval]), 0))","refId":"A","range":true,"legendFormat":"{{topic}}"}],"fieldConfig":{"defaults":{"unit":"short","custom":{"drawStyle":"line","lineInterpolation":"linear","lineWidth":1,"fillOpacity":10,"spanNulls":false,"showPoints":"auto","axisPlacement":"auto","scaleDistribution":{"type":"linear"},"hideFrom":{"legend":false,"tooltip":false,"viz":false}},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"legend":{"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"desc"}}},{"id":12,"type":"timeseries","title":"Topic offset delta over 
12h","gridPos":{"h":8,"w":12,"x":12,"y":14},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"sum by (topic) (max_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"$topic\"}[12h]) - min_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"$topic\"}[12h]))","refId":"A","range":true,"legendFormat":"{{topic}}"}],"fieldConfig":{"defaults":{"unit":"short","custom":{"drawStyle":"line","lineInterpolation":"linear","lineWidth":1,"fillOpacity":10,"spanNulls":false,"showPoints":"auto","axisPlacement":"auto","scaleDistribution":{"type":"linear"},"hideFrom":{"legend":false,"tooltip":false,"viz":false}},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"legend":{"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"desc"}}},{"id":13,"type":"timeseries","title":"Current topic offset","gridPos":{"h":8,"w":12,"x":0,"y":22},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"sum by (topic) 
(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"$topic\"})","refId":"A","range":true,"legendFormat":"{{topic}}"}],"fieldConfig":{"defaults":{"unit":"short","custom":{"drawStyle":"line","lineInterpolation":"linear","lineWidth":1,"fillOpacity":10,"spanNulls":false,"showPoints":"auto","axisPlacement":"auto","scaleDistribution":{"type":"linear"},"hideFrom":{"legend":false,"tooltip":false,"viz":false}},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"legend":{"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"desc"}}},{"id":14,"type":"bargauge","title":"Partitions by topic","gridPos":{"h":8,"w":12,"x":12,"y":22},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"sum by (topic) (kafka_topic_partitions{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"$topic\"})","refId":"A","range":false,"legendFormat":"{{topic}}","instant":true}],"fieldConfig":{"defaults":{"unit":"short","mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"displayMode":"gradient","legend":{"displayMode":"list","placement":"bottom","showLegend":false},"orientation":"horizontal","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"showUnfilled":true}},{"id":20,"type":"row","title":"Consumer groups","collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":30},"panels":[]},{"id":21,"type":"timeseries","title":"Consumer lag by group/topic","gridPos":{"h":8,"w":24,"x":0,"y":31},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"sum by (consumergroup, topic) 
(kafka_consumergroup_lag_sum{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"$topic\"})","refId":"A","range":true,"legendFormat":"{{consumergroup}} / {{topic}}"}],"fieldConfig":{"defaults":{"unit":"short","custom":{"drawStyle":"line","lineInterpolation":"linear","lineWidth":1,"fillOpacity":10,"spanNulls":false,"showPoints":"auto","axisPlacement":"auto","scaleDistribution":{"type":"linear"},"hideFrom":{"legend":false,"tooltip":false,"viz":false}},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"legend":{"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"desc"}}}],"preload":false,"refresh":"30s","schemaVersion":41,"tags":["kafka","kafka-exporter","brusnika-prod"],"templating":{"list":[{"current":{"text":"VictoriaMetrics","value":"vm"},"includeAll":false,"label":"Data Source","name":"ds_datasource","options":[],"query":"prometheus","refresh":1,"regex":"","type":"datasource"},{"current":{"text":"All","value":"$__all"},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"definition":"label_values(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"^(bru\\.cde\\.folders\\.prod|system-log-prod)$\"},topic)","allValue":".*","includeAll":true,"label":"Topic","name":"topic","options":[],"query":{"qryType":1,"query":"label_values(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-prod\",topic=~\"^(bru\\.cde\\.folders\\.prod|system-log-prod)$\"},topic)","refId":"PrometheusVariableQueryEditor-VariableQuery"},"refresh":1,"regex":"","type":"query"}]},"time":{"from":"now-12h","to":"now"},"timepicker":{},"timezone":"","title":"Kafka Exporter / YC Kafka","uid":"kafka-exporter-yc","version":1} diff --git a/clusters/brusnika-prod/infrastructure/kafka-exporter-yc-rules.yaml b/clusters/brusnika-prod/infrastructure/kafka-exporter-yc-rules.yaml index 95dd180..465c374 100644 --- 
a/clusters/brusnika-prod/infrastructure/kafka-exporter-yc-rules.yaml +++ b/clusters/brusnika-prod/infrastructure/kafka-exporter-yc-rules.yaml @@ -16,38 +16,38 @@ spec: team: infra cluster: brusnika-prod kafka_instance: yc-kafka - source_cluster: yc-kafka + source_cluster: brusnika-prod annotations: summary: YC Kafka exporter is down in brusnika-prod description: No healthy kafka-exporter-yc target is scraped for 10 minutes. - - alert: KafkaTopicNoMessagesFor12h + - alert: KafkaTopicNoMessages12h expr: | ( sum by (topic, kafka_instance, source_cluster, cluster) ( - max_over_time(kafka_topic_partition_current_offset{kafka_instance="yc-kafka", cluster="brusnika-prod", topic=~"^(prod|system_log)$"}[12h]) + max_over_time(kafka_topic_partition_current_offset{kafka_instance="yc-kafka", cluster="brusnika-prod", topic=~"^(bru\\.cde\\.folders\\.prod|system-log-prod)$"}[12h]) - - min_over_time(kafka_topic_partition_current_offset{kafka_instance="yc-kafka", cluster="brusnika-prod", topic=~"^(prod|system_log)$"}[12h]) + min_over_time(kafka_topic_partition_current_offset{kafka_instance="yc-kafka", cluster="brusnika-prod", topic=~"^(bru\\.cde\\.folders\\.prod|system-log-prod)$"}[12h]) ) == 0 ) and on (topic, kafka_instance, source_cluster, cluster) ( min by (topic, kafka_instance, source_cluster, cluster) ( - count_over_time(kafka_topic_partition_current_offset{kafka_instance="yc-kafka", cluster="brusnika-prod", topic=~"^(prod|system_log)$"}[12h]) + count_over_time(kafka_topic_partition_current_offset{kafka_instance="yc-kafka", cluster="brusnika-prod", topic=~"^(bru\\.cde\\.folders\\.prod|system-log-prod)$"}[12h]) ) >= 1400 ) and on (topic, kafka_instance, source_cluster, cluster) ( sum by (topic, kafka_instance, source_cluster, cluster) ( - kafka_topic_partitions{kafka_instance="yc-kafka", cluster="brusnika-prod", topic=~"^(prod|system_log)$"} + kafka_topic_partitions{kafka_instance="yc-kafka", cluster="brusnika-prod", topic=~"^(bru\\.cde\\.folders\\.prod|system-log-prod)$"} ) > 
0 ) for: 5m labels: - severity: warning + severity: critical team: infra cluster: brusnika-prod kafka_instance: yc-kafka - source_cluster: yc-kafka + source_cluster: brusnika-prod annotations: summary: No new messages in Kafka topic for 12h - description: Topic {{ $labels.topic }} in {{ $labels.kafka_instance }} has no offset growth for 12 hours. + description: Kafka topic {{ $labels.topic }} has no offset growth for 12h in brusnika-prod diff --git a/clusters/brusnika-prod/infrastructure/kustomization.yaml b/clusters/brusnika-prod/infrastructure/kustomization.yaml index f39b6d2..4e874ec 100644 --- a/clusters/brusnika-prod/infrastructure/kustomization.yaml +++ b/clusters/brusnika-prod/infrastructure/kustomization.yaml @@ -21,6 +21,8 @@ resources: - ./node-exporter-vmnodescrape.yaml - ./istio-gateway-stats-scrape.yaml - ./istio-dashboard-compat-vmrule.yaml + - ./camunda-servicemonitors.yaml + - ./rabbitmq-exporter.yaml - ./kafka-exporter-yc.yaml - ./kafka-exporter-yc-rules.yaml - ./kafka-exporter-yc-dashboard.yaml diff --git a/clusters/brusnika-prod/infrastructure/patches/postgres-exporter.yaml b/clusters/brusnika-prod/infrastructure/patches/postgres-exporter.yaml index 211cbc9..4cf8de1 100644 --- a/clusters/brusnika-prod/infrastructure/patches/postgres-exporter.yaml +++ b/clusters/brusnika-prod/infrastructure/patches/postgres-exporter.yaml @@ -29,45 +29,11 @@ spec: database: postgres sslmode: disable datasources: - - name: attachments - uri: 192.168.10.8:5432/attachments_db?sslmode=disable - - name: bim - uri: 192.168.10.8:5432/bimapidb?sslmode=disable - - name: comparisons - uri: 192.168.10.8:5432/comparisons_db?sslmode=disable - - name: django - uri: 192.168.10.8:5432/sarex_db?sslmode=disable - - name: documentations - uri: 192.168.10.8:5432/documentations?sslmode=disable - - name: drawings - uri: 192.168.10.8:5432/drawings?sslmode=disable - - name: eav - uri: 192.168.10.8:5432/eav?sslmode=disable - - name: flows - uri: 192.168.10.8:5432/flows_db?sslmode=disable - 
- name: inspections - uri: 192.168.10.8:5432/inspections_db?sslmode=disable - - name: issues - uri: 192.168.10.8:5432/issues?sslmode=disable - - name: notes - uri: 192.168.10.8:5432/notes_db?sslmode=disable + - name: goalert + uri: 192.168.10.8:5432/goalert?sslmode=disable - name: openobserve uri: 192.168.10.8:5432/openobserve?sslmode=disable - name: postgres uri: 192.168.10.8:5432/postgres?sslmode=disable - - name: resources - uri: 192.168.10.8:5432/resources?sslmode=disable - - name: rfi - uri: 192.168.10.8:5432/rfi_db?sslmode=disable - - name: subscriptions - uri: 192.168.10.8:5432/subscriptions?sslmode=disable - - name: system-log - uri: 192.168.10.8:5432/system_log?sslmode=disable - - name: transmittal - uri: 192.168.10.8:5432/transmittal_db?sslmode=disable - - name: workflow - uri: 192.168.10.8:5432/workflows_db?sslmode=disable - - name: workspaces - uri: 192.168.10.8:5432/workspaces_db?sslmode=disable - name: zitadel uri: 192.168.10.8:5432/zitadel?sslmode=disable diff --git a/clusters/brusnika-prod/infrastructure/rabbitmq-exporter.yaml b/clusters/brusnika-prod/infrastructure/rabbitmq-exporter.yaml new file mode 100644 index 0000000..790522f --- /dev/null +++ b/clusters/brusnika-prod/infrastructure/rabbitmq-exporter.yaml @@ -0,0 +1,90 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: rabbitmq-exporter + namespace: workflow + labels: + app: rabbitmq-exporter +spec: + replicas: 1 + selector: + matchLabels: + app: rabbitmq-exporter + template: + metadata: + labels: + app: rabbitmq-exporter + spec: + containers: + - name: rabbitmq-exporter + image: kbudde/rabbitmq-exporter:1.0.0-RC19 + imagePullPolicy: IfNotPresent + env: + - name: RABBIT_URL + value: http://rabbitmq-service.workflow.svc.cluster.local:15672 + - name: RABBIT_USER + valueFrom: + secretKeyRef: + name: rabbitmq-secret + key: username + - name: RABBIT_PASSWORD + valueFrom: + secretKeyRef: + name: rabbitmq-secret + key: password + - name: PUBLISH_PORT + value: "9419" + - name: SKIPVERIFY + 
value: "true" + ports: + - name: metrics + containerPort: 9419 + resources: + requests: + cpu: 25m + memory: 64Mi + limits: + memory: 128Mi +--- +apiVersion: v1 +kind: Service +metadata: + name: rabbitmq-exporter + namespace: workflow + labels: + app: rabbitmq-exporter +spec: + type: ClusterIP + selector: + app: rabbitmq-exporter + ports: + - name: metrics + port: 9419 + targetPort: metrics +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: rabbitmq-exporter + namespace: workflow + labels: + app: rabbitmq-exporter + release: prometheus +spec: + endpoints: + - interval: 30s + path: /metrics + port: metrics + relabelings: + - action: replace + targetLabel: cluster + replacement: brusnika-prod + - action: replace + targetLabel: source_cluster + replacement: brusnika-prod + - action: replace + targetLabel: rabbitmq_instance + replacement: workflow-rabbitmq + selector: + matchLabels: + app: rabbitmq-exporter diff --git a/clusters/brusnika-stage/infrastructure/camunda-servicemonitors.yaml b/clusters/brusnika-stage/infrastructure/camunda-servicemonitors.yaml new file mode 100644 index 0000000..a715f7b --- /dev/null +++ b/clusters/brusnika-stage/infrastructure/camunda-servicemonitors.yaml @@ -0,0 +1,202 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: camunda-camunda-stage-connectors + namespace: camunda + labels: + app: camunda-platform + app.kubernetes.io/instance: camunda + app.kubernetes.io/name: camunda-stage + app.kubernetes.io/part-of: camunda-platform + release: metrics +spec: + endpoints: + - honorLabels: true + interval: 10s + path: /actuator/prometheus + port: http + relabelings: + - action: replace + targetLabel: cluster + replacement: brusnika-stage + - action: replace + targetLabel: source_cluster + replacement: brusnika-stage + selector: + matchLabels: + app: camunda-platform + app.kubernetes.io/component: connectors +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + 
name: camunda-camunda-stage-identity + namespace: camunda + labels: + app: camunda-platform + app.kubernetes.io/instance: camunda + app.kubernetes.io/name: camunda-stage + app.kubernetes.io/part-of: camunda-platform + release: metrics +spec: + endpoints: + - honorLabels: true + interval: 10s + path: /actuator/prometheus + port: metrics + relabelings: + - action: replace + targetLabel: cluster + replacement: brusnika-stage + - action: replace + targetLabel: source_cluster + replacement: brusnika-stage + selector: + matchLabels: + app: camunda-platform + app.kubernetes.io/component: identity +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: camunda-camunda-stage-operate + namespace: camunda + labels: + app: camunda-platform + app.kubernetes.io/instance: camunda + app.kubernetes.io/name: camunda-stage + app.kubernetes.io/part-of: camunda-platform + release: metrics +spec: + endpoints: + - honorLabels: true + interval: 10s + path: /actuator/prometheus + port: management + relabelings: + - action: replace + targetLabel: cluster + replacement: brusnika-stage + - action: replace + targetLabel: source_cluster + replacement: brusnika-stage + selector: + matchLabels: + app: camunda-platform + app.kubernetes.io/component: operate +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: camunda-camunda-stage-optimize + namespace: camunda + labels: + app: camunda-platform + app.kubernetes.io/instance: camunda + app.kubernetes.io/name: camunda-stage + app.kubernetes.io/part-of: camunda-platform + release: metrics +spec: + endpoints: + - honorLabels: true + interval: 10s + path: /actuator/prometheus + port: management + relabelings: + - action: replace + targetLabel: cluster + replacement: brusnika-stage + - action: replace + targetLabel: source_cluster + replacement: brusnika-stage + selector: + matchLabels: + app: camunda-platform + app.kubernetes.io/component: optimize +--- +apiVersion: monitoring.coreos.com/v1 
+kind: ServiceMonitor +metadata: + name: camunda-camunda-stage-tasklist + namespace: camunda + labels: + app: camunda-platform + app.kubernetes.io/instance: camunda + app.kubernetes.io/name: camunda-stage + app.kubernetes.io/part-of: camunda-platform + release: metrics +spec: + endpoints: + - honorLabels: true + interval: 10s + path: /actuator/prometheus + port: management + relabelings: + - action: replace + targetLabel: cluster + replacement: brusnika-stage + - action: replace + targetLabel: source_cluster + replacement: brusnika-stage + selector: + matchLabels: + app: camunda-platform + app.kubernetes.io/component: tasklist +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: camunda-camunda-stage-zeebe + namespace: camunda + labels: + app: camunda-platform + app.kubernetes.io/instance: camunda + app.kubernetes.io/name: camunda-stage + app.kubernetes.io/part-of: camunda-platform + release: metrics +spec: + endpoints: + - honorLabels: true + interval: 10s + path: /actuator/prometheus + port: http + relabelings: + - action: replace + targetLabel: cluster + replacement: brusnika-stage + - action: replace + targetLabel: source_cluster + replacement: brusnika-stage + selector: + matchLabels: + app: camunda-platform + app.kubernetes.io/component: zeebe-broker +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: camunda-camunda-stage-zeebe-gateway + namespace: camunda + labels: + app: camunda-platform + app.kubernetes.io/instance: camunda + app.kubernetes.io/name: camunda-stage + app.kubernetes.io/part-of: camunda-platform + release: metrics +spec: + endpoints: + - honorLabels: true + interval: 10s + path: /actuator/prometheus + port: http + relabelings: + - action: replace + targetLabel: cluster + replacement: brusnika-stage + - action: replace + targetLabel: source_cluster + replacement: brusnika-stage + selector: + matchLabels: + app: camunda-platform + app.kubernetes.io/component: zeebe-gateway diff 
--git a/clusters/brusnika-stage/infrastructure/kafka-exporter-yc-dashboard.yaml b/clusters/brusnika-stage/infrastructure/kafka-exporter-yc-dashboard.yaml index 139bc79..29789da 100644 --- a/clusters/brusnika-stage/infrastructure/kafka-exporter-yc-dashboard.yaml +++ b/clusters/brusnika-stage/infrastructure/kafka-exporter-yc-dashboard.yaml @@ -9,4 +9,4 @@ metadata: grafana_folder: Kafka data: kafka-exporter-yc.json: |- - {"annotations":{"list":[{"builtIn":1,"datasource":{"type":"grafana","uid":"-- Grafana --"},"enable":true,"hide":true,"iconColor":"rgba(0, 211, 255, 1)","name":"Annotations & Alerts","type":"dashboard"}]},"editable":true,"fiscalYearStartMonth":0,"graphTooltip":0,"links":[],"panels":[{"id":1,"type":"row","title":"YC Kafka exporter","collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":0},"panels":[]},{"id":2,"type":"stat","title":"Exporter up","gridPos":{"h":4,"w":6,"x":0,"y":1},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"max(up{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\"})","refId":"A","range":false,"instant":true}],"fieldConfig":{"defaults":{"unit":"short","mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"red"},{"color":"green","value":1}]}},"overrides":[]},"options":{"colorMode":"background","graphMode":"none","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"}},{"id":3,"type":"stat","title":"Topics without new messages for 12h","gridPos":{"h":4,"w":9,"x":6,"y":1},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"count(((sum by (topic) (max_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic!~\"^__.*\"}[12h]) - 
min_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic!~\"^__.*\"}[12h])) == 0) and on(topic) (min by (topic) (count_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic!~\"^__.*\"}[12h])) >= 1400) and on(topic) (sum by (topic) (kafka_topic_partitions{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic!~\"^__.*\"}) > 0)))","refId":"A","range":false,"instant":true}],"fieldConfig":{"defaults":{"unit":"short","mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"},{"color":"orange","value":1},{"color":"red","value":5}]}},"overrides":[]},"options":{"colorMode":"background","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"}},{"id":4,"type":"stat","title":"Known topics","gridPos":{"h":4,"w":9,"x":15,"y":1},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"count(count by (topic) (kafka_topic_partitions{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic!~\"^__.*\"}))","refId":"A","range":false,"instant":true}],"fieldConfig":{"defaults":{"unit":"short","mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"colorMode":"value","graphMode":"none","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"}},{"id":5,"type":"table","title":"Topics with no offset growth for 12h","gridPos":{"h":8,"w":24,"x":0,"y":5},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"((sum by (topic, kafka_instance, cluster) 
(max_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic!~\"^__.*\"}[12h]) - min_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic!~\"^__.*\"}[12h])) == 0) and on(topic) (min by (topic) (count_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic!~\"^__.*\"}[12h])) >= 1400) and on(topic) (sum by (topic) (kafka_topic_partitions{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic!~\"^__.*\"}) > 0))","refId":"A","range":false,"instant":true}],"fieldConfig":{"defaults":{"custom":{"align":"auto","cellOptions":{"type":"auto"}},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"showHeader":true,"cellHeight":"sm"},"transformations":[{"id":"labelsToFields","options":{"mode":"columns"}}]},{"id":10,"type":"row","title":"Topic traffic","collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":13},"panels":[]},{"id":11,"type":"timeseries","title":"Topic offset delta by $__rate_interval","gridPos":{"h":8,"w":12,"x":0,"y":14},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"sum by (topic) (clamp_min(delta(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"$topic\"}[$__rate_interval]), 
0))","refId":"A","range":true,"legendFormat":"{{topic}}"}],"fieldConfig":{"defaults":{"unit":"short","custom":{"drawStyle":"line","lineInterpolation":"linear","lineWidth":1,"fillOpacity":10,"spanNulls":false,"showPoints":"auto","axisPlacement":"auto","scaleDistribution":{"type":"linear"},"hideFrom":{"legend":false,"tooltip":false,"viz":false}},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"legend":{"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"desc"}}},{"id":12,"type":"timeseries","title":"Topic offset delta over 12h","gridPos":{"h":8,"w":12,"x":12,"y":14},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"sum by (topic) (max_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"$topic\"}[12h]) - min_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"$topic\"}[12h]))","refId":"A","range":true,"legendFormat":"{{topic}}"}],"fieldConfig":{"defaults":{"unit":"short","custom":{"drawStyle":"line","lineInterpolation":"linear","lineWidth":1,"fillOpacity":10,"spanNulls":false,"showPoints":"auto","axisPlacement":"auto","scaleDistribution":{"type":"linear"},"hideFrom":{"legend":false,"tooltip":false,"viz":false}},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"legend":{"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"desc"}}},{"id":13,"type":"timeseries","title":"Current topic offset","gridPos":{"h":8,"w":12,"x":0,"y":22},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"sum by (topic) 
(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"$topic\"})","refId":"A","range":true,"legendFormat":"{{topic}}"}],"fieldConfig":{"defaults":{"unit":"short","custom":{"drawStyle":"line","lineInterpolation":"linear","lineWidth":1,"fillOpacity":10,"spanNulls":false,"showPoints":"auto","axisPlacement":"auto","scaleDistribution":{"type":"linear"},"hideFrom":{"legend":false,"tooltip":false,"viz":false}},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"legend":{"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"desc"}}},{"id":14,"type":"bargauge","title":"Partitions by topic","gridPos":{"h":8,"w":12,"x":12,"y":22},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"sum by (topic) (kafka_topic_partitions{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"$topic\"})","refId":"A","range":false,"legendFormat":"{{topic}}","instant":true}],"fieldConfig":{"defaults":{"unit":"short","mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"displayMode":"gradient","legend":{"displayMode":"list","placement":"bottom","showLegend":false},"orientation":"horizontal","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"showUnfilled":true}},{"id":20,"type":"row","title":"Consumer groups","collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":30},"panels":[]},{"id":21,"type":"timeseries","title":"Consumer lag by group/topic","gridPos":{"h":8,"w":24,"x":0,"y":31},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"sum by (consumergroup, topic) 
(kafka_consumergroup_lag_sum{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"$topic\"})","refId":"A","range":true,"legendFormat":"{{consumergroup}} / {{topic}}"}],"fieldConfig":{"defaults":{"unit":"short","custom":{"drawStyle":"line","lineInterpolation":"linear","lineWidth":1,"fillOpacity":10,"spanNulls":false,"showPoints":"auto","axisPlacement":"auto","scaleDistribution":{"type":"linear"},"hideFrom":{"legend":false,"tooltip":false,"viz":false}},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"legend":{"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"desc"}}}],"preload":false,"refresh":"30s","schemaVersion":41,"tags":["kafka","kafka-exporter","brusnika-stage"],"templating":{"list":[{"current":{"text":"VictoriaMetrics","value":"vm"},"includeAll":false,"label":"Data Source","name":"ds_datasource","options":[],"query":"prometheus","refresh":1,"regex":"","type":"datasource"},{"current":{"text":"All","value":"$__all"},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"definition":"label_values(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\"},topic)","allValue":".*","includeAll":true,"label":"Topic","name":"topic","options":[],"query":{"qryType":1,"query":"label_values(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\"},topic)","refId":"PrometheusVariableQueryEditor-VariableQuery"},"refresh":1,"regex":"","type":"query"}]},"time":{"from":"now-12h","to":"now"},"timepicker":{},"timezone":"","title":"Kafka Exporter / YC Kafka","uid":"kafka-exporter-yc","version":1} + {"annotations":{"list":[{"builtIn":1,"datasource":{"type":"grafana","uid":"-- Grafana --"},"enable":true,"hide":true,"iconColor":"rgba(0, 211, 255, 1)","name":"Annotations & 
Alerts","type":"dashboard"}]},"editable":true,"fiscalYearStartMonth":0,"graphTooltip":0,"links":[],"panels":[{"id":1,"type":"row","title":"YC Kafka exporter","collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":0},"panels":[]},{"id":2,"type":"stat","title":"Exporter up","gridPos":{"h":4,"w":6,"x":0,"y":1},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"max(up{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\"})","refId":"A","range":false,"instant":true}],"fieldConfig":{"defaults":{"unit":"short","mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"red"},{"color":"green","value":1}]}},"overrides":[]},"options":{"colorMode":"background","graphMode":"none","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"}},{"id":3,"type":"stat","title":"Topics without new messages for 12h","gridPos":{"h":4,"w":9,"x":6,"y":1},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"count(((sum by (topic) (max_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"^bru\\.cde\\.folders\\.stage$\"}[12h]) - min_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"^bru\\.cde\\.folders\\.stage$\"}[12h])) == 0) and on(topic) (min by (topic) (count_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"^bru\\.cde\\.folders\\.stage$\"}[12h])) >= 1400) and on(topic) (sum by (topic) (kafka_topic_partitions{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"^bru\\.cde\\.folders\\.stage$\"}) > 
0)))","refId":"A","range":false,"instant":true}],"fieldConfig":{"defaults":{"unit":"short","mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"},{"color":"orange","value":1},{"color":"red","value":5}]}},"overrides":[]},"options":{"colorMode":"background","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"}},{"id":4,"type":"stat","title":"Known topics","gridPos":{"h":4,"w":9,"x":15,"y":1},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"count(count by (topic) (kafka_topic_partitions{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"^bru\\.cde\\.folders\\.stage$\"}))","refId":"A","range":false,"instant":true}],"fieldConfig":{"defaults":{"unit":"short","mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"colorMode":"value","graphMode":"none","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"}},{"id":5,"type":"table","title":"Topics with no offset growth for 12h","gridPos":{"h":8,"w":24,"x":0,"y":5},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"((sum by (topic, kafka_instance, cluster) (max_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"^bru\\.cde\\.folders\\.stage$\"}[12h]) - min_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"^bru\\.cde\\.folders\\.stage$\"}[12h])) == 0) and on(topic) (min by (topic) (count_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"^bru\\.cde\\.folders\\.stage$\"}[12h])) >= 1400) and 
on(topic) (sum by (topic) (kafka_topic_partitions{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"^bru\\.cde\\.folders\\.stage$\"}) > 0))","refId":"A","range":false,"instant":true}],"fieldConfig":{"defaults":{"custom":{"align":"auto","cellOptions":{"type":"auto"}},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"showHeader":true,"cellHeight":"sm"},"transformations":[{"id":"labelsToFields","options":{"mode":"columns"}}]},{"id":10,"type":"row","title":"Topic traffic","collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":13},"panels":[]},{"id":11,"type":"timeseries","title":"Topic offset delta by $__rate_interval","gridPos":{"h":8,"w":12,"x":0,"y":14},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"sum by (topic) (clamp_min(delta(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"$topic\"}[$__rate_interval]), 0))","refId":"A","range":true,"legendFormat":"{{topic}}"}],"fieldConfig":{"defaults":{"unit":"short","custom":{"drawStyle":"line","lineInterpolation":"linear","lineWidth":1,"fillOpacity":10,"spanNulls":false,"showPoints":"auto","axisPlacement":"auto","scaleDistribution":{"type":"linear"},"hideFrom":{"legend":false,"tooltip":false,"viz":false}},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"legend":{"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"desc"}}},{"id":12,"type":"timeseries","title":"Topic offset delta over 12h","gridPos":{"h":8,"w":12,"x":12,"y":14},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"sum by (topic) 
(max_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"$topic\"}[12h]) - min_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"$topic\"}[12h]))","refId":"A","range":true,"legendFormat":"{{topic}}"}],"fieldConfig":{"defaults":{"unit":"short","custom":{"drawStyle":"line","lineInterpolation":"linear","lineWidth":1,"fillOpacity":10,"spanNulls":false,"showPoints":"auto","axisPlacement":"auto","scaleDistribution":{"type":"linear"},"hideFrom":{"legend":false,"tooltip":false,"viz":false}},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"legend":{"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"desc"}}},{"id":13,"type":"timeseries","title":"Current topic offset","gridPos":{"h":8,"w":12,"x":0,"y":22},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"sum by (topic) (kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"$topic\"})","refId":"A","range":true,"legendFormat":"{{topic}}"}],"fieldConfig":{"defaults":{"unit":"short","custom":{"drawStyle":"line","lineInterpolation":"linear","lineWidth":1,"fillOpacity":10,"spanNulls":false,"showPoints":"auto","axisPlacement":"auto","scaleDistribution":{"type":"linear"},"hideFrom":{"legend":false,"tooltip":false,"viz":false}},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"legend":{"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"desc"}}},{"id":14,"type":"bargauge","title":"Partitions by 
topic","gridPos":{"h":8,"w":12,"x":12,"y":22},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"sum by (topic) (kafka_topic_partitions{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"$topic\"})","refId":"A","range":false,"legendFormat":"{{topic}}","instant":true}],"fieldConfig":{"defaults":{"unit":"short","mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"displayMode":"gradient","legend":{"displayMode":"list","placement":"bottom","showLegend":false},"orientation":"horizontal","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"showUnfilled":true}},{"id":20,"type":"row","title":"Consumer groups","collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":30},"panels":[]},{"id":21,"type":"timeseries","title":"Consumer lag by group/topic","gridPos":{"h":8,"w":24,"x":0,"y":31},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"sum by (consumergroup, topic) (kafka_consumergroup_lag_sum{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"$topic\"})","refId":"A","range":true,"legendFormat":"{{consumergroup}} / 
{{topic}}"}],"fieldConfig":{"defaults":{"unit":"short","custom":{"drawStyle":"line","lineInterpolation":"linear","lineWidth":1,"fillOpacity":10,"spanNulls":false,"showPoints":"auto","axisPlacement":"auto","scaleDistribution":{"type":"linear"},"hideFrom":{"legend":false,"tooltip":false,"viz":false}},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"legend":{"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"desc"}}}],"preload":false,"refresh":"30s","schemaVersion":41,"tags":["kafka","kafka-exporter","brusnika-stage"],"templating":{"list":[{"current":{"text":"VictoriaMetrics","value":"vm"},"includeAll":false,"label":"Data Source","name":"ds_datasource","options":[],"query":"prometheus","refresh":1,"regex":"","type":"datasource"},{"current":{"text":"All","value":"$__all"},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"definition":"label_values(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"^bru\\.cde\\.folders\\.stage$\"},topic)","allValue":".*","includeAll":true,"label":"Topic","name":"topic","options":[],"query":{"qryType":1,"query":"label_values(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"^bru\\.cde\\.folders\\.stage$\"},topic)","refId":"PrometheusVariableQueryEditor-VariableQuery"},"refresh":1,"regex":"","type":"query"}]},"time":{"from":"now-12h","to":"now"},"timepicker":{},"timezone":"","title":"Kafka Exporter / YC Kafka","uid":"kafka-exporter-yc","version":1} diff --git a/clusters/brusnika-stage/infrastructure/kafka-exporter-yc-rules.yaml b/clusters/brusnika-stage/infrastructure/kafka-exporter-yc-rules.yaml index 59517e2..ca23df6 100644 --- a/clusters/brusnika-stage/infrastructure/kafka-exporter-yc-rules.yaml +++ b/clusters/brusnika-stage/infrastructure/kafka-exporter-yc-rules.yaml @@ -16,38 +16,38 @@ spec: team: infra cluster: brusnika-stage 
kafka_instance: yc-kafka - source_cluster: yc-kafka + source_cluster: brusnika-stage annotations: summary: YC Kafka exporter is down in brusnika-stage description: No healthy kafka-exporter-yc target is scraped for 10 minutes. - - alert: KafkaTopicNoMessagesFor12h + - alert: KafkaTopicNoMessages12h expr: | ( sum by (topic, kafka_instance, source_cluster, cluster) ( - max_over_time(kafka_topic_partition_current_offset{kafka_instance="yc-kafka", cluster="brusnika-stage", topic!~"^__.*"}[12h]) + max_over_time(kafka_topic_partition_current_offset{kafka_instance="yc-kafka", cluster="brusnika-stage", topic=~"^bru\\.cde\\.folders\\.stage$"}[12h]) - - min_over_time(kafka_topic_partition_current_offset{kafka_instance="yc-kafka", cluster="brusnika-stage", topic!~"^__.*"}[12h]) + min_over_time(kafka_topic_partition_current_offset{kafka_instance="yc-kafka", cluster="brusnika-stage", topic=~"^bru\\.cde\\.folders\\.stage$"}[12h]) ) == 0 ) and on (topic, kafka_instance, source_cluster, cluster) ( min by (topic, kafka_instance, source_cluster, cluster) ( - count_over_time(kafka_topic_partition_current_offset{kafka_instance="yc-kafka", cluster="brusnika-stage", topic!~"^__.*"}[12h]) + count_over_time(kafka_topic_partition_current_offset{kafka_instance="yc-kafka", cluster="brusnika-stage", topic=~"^bru\\.cde\\.folders\\.stage$"}[12h]) ) >= 1400 ) and on (topic, kafka_instance, source_cluster, cluster) ( sum by (topic, kafka_instance, source_cluster, cluster) ( - kafka_topic_partitions{kafka_instance="yc-kafka", cluster="brusnika-stage", topic!~"^__.*"} + kafka_topic_partitions{kafka_instance="yc-kafka", cluster="brusnika-stage", topic=~"^bru\\.cde\\.folders\\.stage$"} ) > 0 ) for: 5m labels: - severity: warning + severity: critical team: infra cluster: brusnika-stage kafka_instance: yc-kafka - source_cluster: yc-kafka + source_cluster: brusnika-stage annotations: summary: No new messages in Kafka topic for 12h - description: Topic {{ $labels.topic }} in {{ $labels.kafka_instance }} 
has no offset growth for 12 hours. + description: Kafka topic {{ $labels.topic }} has no offset growth for 12h in brusnika-stage diff --git a/clusters/brusnika-stage/infrastructure/kustomization.yaml b/clusters/brusnika-stage/infrastructure/kustomization.yaml index fda1e60..5703009 100644 --- a/clusters/brusnika-stage/infrastructure/kustomization.yaml +++ b/clusters/brusnika-stage/infrastructure/kustomization.yaml @@ -21,6 +21,8 @@ resources: - ./node-exporter-vmnodescrape.yaml - ./istio-gateway-stats-scrape.yaml - ./istio-dashboard-compat-vmrule.yaml + - ./camunda-servicemonitors.yaml + - ./rabbitmq-exporter.yaml - ./kafka-exporter-yc.yaml - ./kafka-exporter-yc-rules.yaml - ./kafka-exporter-yc-dashboard.yaml diff --git a/clusters/brusnika-stage/infrastructure/patches/postgres-exporter.yaml b/clusters/brusnika-stage/infrastructure/patches/postgres-exporter.yaml index d50c50c..2fba7ee 100644 --- a/clusters/brusnika-stage/infrastructure/patches/postgres-exporter.yaml +++ b/clusters/brusnika-stage/infrastructure/patches/postgres-exporter.yaml @@ -31,10 +31,12 @@ spec: datasources: - name: attachments uri: 192.168.2.45:5432/attachments_db?sslmode=disable - - name: bim - uri: 192.168.2.45:5432/bimapidb?sslmode=disable + - name: checklists + uri: 192.168.2.45:5432/checklists?sslmode=disable - name: comparisons uri: 192.168.2.45:5432/comparisons_db?sslmode=disable + - name: contracts + uri: 192.168.2.45:5432/contracts?sslmode=disable - name: django uri: 192.168.2.45:5432/sarex_db?sslmode=disable - name: documentations @@ -45,6 +47,10 @@ spec: uri: 192.168.2.45:5432/eav?sslmode=disable - name: flows uri: 192.168.2.45:5432/flows_db?sslmode=disable + - name: gitea + uri: 192.168.2.45:5432/gitea?sslmode=disable + - name: goalert + uri: 192.168.2.45:5432/goalert?sslmode=disable - name: inspections uri: 192.168.2.45:5432/inspections_db?sslmode=disable - name: issues @@ -55,6 +61,8 @@ spec: uri: 192.168.2.45:5432/openobserve?sslmode=disable - name: postgres uri: 
192.168.2.45:5432/postgres?sslmode=disable + - name: pm + uri: 192.168.2.45:5432/pm?sslmode=disable - name: resources uri: 192.168.2.45:5432/resources?sslmode=disable - name: rfi
diff --git a/clusters/brusnika-stage/infrastructure/rabbitmq-exporter.yaml b/clusters/brusnika-stage/infrastructure/rabbitmq-exporter.yaml
new file mode 100644
index 0000000..07f8891
--- /dev/null
+++ b/clusters/brusnika-stage/infrastructure/rabbitmq-exporter.yaml
@@ -0,0 +1,117 @@
+# Prometheus exporter for the RabbitMQ instance in the "workflow" namespace
+# (brusnika-stage): Deployment, Service and ServiceMonitor.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: rabbitmq-exporter
+  namespace: workflow
+  labels:
+    app: rabbitmq-exporter
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: rabbitmq-exporter
+  template:
+    metadata:
+      labels:
+        app: rabbitmq-exporter
+    spec:
+      containers:
+        - name: rabbitmq-exporter
+          image: kbudde/rabbitmq-exporter:1.0.0-RC19
+          imagePullPolicy: IfNotPresent
+          env:
+            # RabbitMQ endpoint the exporter connects to (plain HTTP).
+            - name: RABBIT_URL
+              value: http://rabbitmq-service.workflow.svc.cluster.local:15672
+            # Credentials come from the rabbitmq-secret Secret.
+            - name: RABBIT_USER
+              valueFrom:
+                secretKeyRef:
+                  name: rabbitmq-secret
+                  key: username
+            - name: RABBIT_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  name: rabbitmq-secret
+                  key: password
+            # Listen port; must match the "metrics" containerPort below.
+            - name: PUBLISH_PORT
+              value: "9419"
+            # SKIPVERIFY is deliberately not set: RABBIT_URL is http://, so it
+            # would have no effect, and it would silently disable certificate
+            # checks if the URL were ever switched to https://.
+          ports:
+            - name: metrics
+              containerPort: 9419
+          # TCP probes: they only check that the exporter is listening, so a
+          # RabbitMQ outage does not restart the exporter pod.
+          readinessProbe:
+            tcpSocket:
+              port: metrics
+            periodSeconds: 10
+          livenessProbe:
+            tcpSocket:
+              port: metrics
+            initialDelaySeconds: 10
+            periodSeconds: 20
+          # Hardening; assumes the exporter writes no files locally (TODO confirm).
+          securityContext:
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: true
+            capabilities:
+              drop:
+                - ALL
+          resources:
+            requests:
+              cpu: 25m
+              memory: 64Mi
+            limits:
+              memory: 128Mi
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: rabbitmq-exporter
+  namespace: workflow
+  labels:
+    app: rabbitmq-exporter
+spec:
+  type: ClusterIP
+  selector:
+    app: rabbitmq-exporter
+  ports:
+    - name: metrics
+      port: 9419
+      targetPort: metrics
+---
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: rabbitmq-exporter
+  namespace: workflow
+  labels:
+    app: rabbitmq-exporter
+    # NOTE(review): the brusnika-prod Camunda ServiceMonitor in this patch uses
+    # "release: metrics"; confirm which label the operator actually selects.
+    release: prometheus
+spec:
+  endpoints:
+    - interval: 30s
+      path: /metrics
+      port: metrics
+      relabelings:
+        # Static labels attached to every scraped series.
+        - action: replace
+          targetLabel: cluster
+          replacement: brusnika-stage
+        - action: replace
+          targetLabel: source_cluster
+          replacement: brusnika-stage
+        - action: replace
+          targetLabel: rabbitmq_instance
+          replacement: workflow-rabbitmq
+  selector:
+    matchLabels:
+      app: rabbitmq-exporter