# iac/clusters/brusnika-prod/infrastructure/kafka-exporter-yc-rules.yaml
# (repository viewer header: 54 lines, 2.3 KiB, YAML)

---
# VMRule (VictoriaMetrics operator) defining alerts for the "yc-kafka"
# Kafka exporter as seen from the brusnika-prod cluster.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: kafka-exporter-yc-rules
  namespace: vmstack
spec:
  groups:
    - name: kafka-exporter-yc.rules
      # Evaluation interval for every rule in this group.
      interval: 5m
      rules:
        # Exporter availability.
        # absent(... == 1) yields a sample only when there is NO `up` series
        # with these labels whose value is 1, i.e. the target is either
        # missing entirely or is being scraped unsuccessfully.
        # The condition must hold for 10m before the alert fires.
        - alert: KafkaExporterYcDown
          expr: absent(up{kafka_instance="yc-kafka", cluster="brusnika-prod"} == 1)
          for: 10m
          labels:
            severity: critical
            team: infra
            cluster: brusnika-prod
            kafka_instance: yc-kafka
            source_cluster: brusnika-prod
          annotations:
            summary: 'YC Kafka exporter is down in brusnika-prod'
            description: 'No healthy kafka-exporter-yc target is scraped for 10 minutes.'

        # Topic staleness for the two watched topics
        # (bru.cde.folders.prod and system-log-prod).
        # All three conditions below must be true for the same
        # (topic, kafka_instance, source_cluster, cluster) tuple:
        #   1. Offset growth over the last 12h, summed across the topic's
        #      partitions (max - min of the current offset), is exactly 0.
        #   2. Every partition series has at least 1400 samples in the 12h
        #      window, so gaps in the data are not mistaken for "no messages".
        #      NOTE(review): 1400 is just under 1440 = 12h at a 30s scrape
        #      interval - presumably that is the scrape interval; confirm,
        #      because with a longer interval this condition never holds.
        #   3. The exporter reports a partition count above 0 for the topic.
        # The "\\." sequences in the regex are PromQL string escapes for a
        # literal dot; inside a YAML literal block they are passed through
        # unchanged.
        - alert: KafkaTopicNoMessages12h
          expr: |
            (
              sum by (topic, kafka_instance, source_cluster, cluster) (
                max_over_time(kafka_topic_partition_current_offset{kafka_instance="yc-kafka", cluster="brusnika-prod", topic=~"^(bru\\.cde\\.folders\\.prod|system-log-prod)$"}[12h])
                -
                min_over_time(kafka_topic_partition_current_offset{kafka_instance="yc-kafka", cluster="brusnika-prod", topic=~"^(bru\\.cde\\.folders\\.prod|system-log-prod)$"}[12h])
              ) == 0
            )
            and on (topic, kafka_instance, source_cluster, cluster)
            (
              min by (topic, kafka_instance, source_cluster, cluster) (
                count_over_time(kafka_topic_partition_current_offset{kafka_instance="yc-kafka", cluster="brusnika-prod", topic=~"^(bru\\.cde\\.folders\\.prod|system-log-prod)$"}[12h])
              ) >= 1400
            )
            and on (topic, kafka_instance, source_cluster, cluster)
            (
              sum by (topic, kafka_instance, source_cluster, cluster) (
                kafka_topic_partitions{kafka_instance="yc-kafka", cluster="brusnika-prod", topic=~"^(bru\\.cde\\.folders\\.prod|system-log-prod)$"}
              ) > 0
            )
          for: 5m
          labels:
            severity: critical
            team: infra
            cluster: brusnika-prod
            kafka_instance: yc-kafka
            source_cluster: brusnika-prod
          annotations:
            summary: 'No new messages in Kafka topic for 12h'
            # Quoted so the template braces can never be read as YAML syntax.
            description: 'Kafka topic {{ $labels.topic }} has no offset growth for 12h in brusnika-prod'