diff --git a/clusters/brusnika-stage/infrastructure/kafka-exporter-yc-dashboard.yaml b/clusters/brusnika-stage/infrastructure/kafka-exporter-yc-dashboard.yaml
new file mode 100644
index 0000000..5ab4e31
--- /dev/null
+++ b/clusters/brusnika-stage/infrastructure/kafka-exporter-yc-dashboard.yaml
@@ -0,0 +1,15 @@
+# Grafana dashboard "Kafka Exporter / YC Kafka", shipped as a ConfigMap.
+# The "Exporter up" and "Topics without new messages for 12h" stat queries end
+# with `or vector(0)` so an empty result renders as 0 rather than "No data".
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: kafka-exporter-yc-dashboard
+  namespace: prometheus-stack
+  labels:
+    grafana_dashboard: "1"
+  annotations:
+    grafana_folder: Kafka
+data:
+  kafka-exporter-yc.json: |-
+    {"annotations":{"list":[{"builtIn":1,"datasource":{"type":"grafana","uid":"-- Grafana --"},"enable":true,"hide":true,"iconColor":"rgba(0, 211, 255, 1)","name":"Annotations & Alerts","type":"dashboard"}]},"editable":true,"fiscalYearStartMonth":0,"graphTooltip":0,"links":[],"panels":[{"id":1,"type":"row","title":"YC Kafka exporter","collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":0},"panels":[]},{"id":2,"type":"stat","title":"Exporter up","gridPos":{"h":4,"w":6,"x":0,"y":1},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"max(up{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\"}) or vector(0)","refId":"A","range":false,"instant":true}],"fieldConfig":{"defaults":{"unit":"short","mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"red"},{"color":"green","value":1}]}},"overrides":[]},"options":{"colorMode":"background","graphMode":"none","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"}},{"id":3,"type":"stat","title":"Topics without new messages for 12h","gridPos":{"h":4,"w":9,"x":6,"y":1},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"count((sum by (topic) (max_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic!~\"^__.*\"}[12h]) - min_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic!~\"^__.*\"}[12h])) == 0) and on(topic) (sum by (topic) (kafka_topic_partitions{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic!~\"^__.*\"}) > 0)) or vector(0)","refId":"A","range":false,"instant":true}],"fieldConfig":{"defaults":{"unit":"short","mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"},{"color":"orange","value":1},{"color":"red","value":5}]}},"overrides":[]},"options":{"colorMode":"background","graphMode":"area","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"}},{"id":4,"type":"stat","title":"Known topics","gridPos":{"h":4,"w":9,"x":15,"y":1},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"count(count by (topic) (kafka_topic_partitions{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic!~\"^__.*\"}))","refId":"A","range":false,"instant":true}],"fieldConfig":{"defaults":{"unit":"short","mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"colorMode":"value","graphMode":"none","justifyMode":"auto","orientation":"auto","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"textMode":"auto"}},{"id":5,"type":"table","title":"Topics with no offset growth for 12h","gridPos":{"h":8,"w":24,"x":0,"y":5},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"(sum by (topic, kafka_instance, cluster) (max_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic!~\"^__.*\"}[12h]) - min_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic!~\"^__.*\"}[12h])) == 0) and on(topic) (sum by (topic) (kafka_topic_partitions{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic!~\"^__.*\"}) > 0)","refId":"A","range":false,"instant":true}],"fieldConfig":{"defaults":{"custom":{"align":"auto","cellOptions":{"type":"auto"}},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"showHeader":true,"cellHeight":"sm"},"transformations":[{"id":"labelsToFields","options":{"mode":"columns"}}]},{"id":10,"type":"row","title":"Topic traffic","collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":13},"panels":[]},{"id":11,"type":"timeseries","title":"Topic offset delta by $__rate_interval","gridPos":{"h":8,"w":12,"x":0,"y":14},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"sum by (topic) (clamp_min(delta(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"$topic\"}[$__rate_interval]), 0))","refId":"A","range":true,"legendFormat":"{{topic}}"}],"fieldConfig":{"defaults":{"unit":"short","custom":{"drawStyle":"line","lineInterpolation":"linear","lineWidth":1,"fillOpacity":10,"spanNulls":false,"showPoints":"auto","axisPlacement":"auto","scaleDistribution":{"type":"linear"},"hideFrom":{"legend":false,"tooltip":false,"viz":false}},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"legend":{"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"desc"}}},{"id":12,"type":"timeseries","title":"Topic offset delta over 12h","gridPos":{"h":8,"w":12,"x":12,"y":14},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"sum by (topic) (max_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"$topic\"}[12h]) - min_over_time(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"$topic\"}[12h]))","refId":"A","range":true,"legendFormat":"{{topic}}"}],"fieldConfig":{"defaults":{"unit":"short","custom":{"drawStyle":"line","lineInterpolation":"linear","lineWidth":1,"fillOpacity":10,"spanNulls":false,"showPoints":"auto","axisPlacement":"auto","scaleDistribution":{"type":"linear"},"hideFrom":{"legend":false,"tooltip":false,"viz":false}},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"legend":{"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"desc"}}},{"id":13,"type":"timeseries","title":"Current topic offset","gridPos":{"h":8,"w":12,"x":0,"y":22},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"sum by (topic) (kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"$topic\"})","refId":"A","range":true,"legendFormat":"{{topic}}"}],"fieldConfig":{"defaults":{"unit":"short","custom":{"drawStyle":"line","lineInterpolation":"linear","lineWidth":1,"fillOpacity":10,"spanNulls":false,"showPoints":"auto","axisPlacement":"auto","scaleDistribution":{"type":"linear"},"hideFrom":{"legend":false,"tooltip":false,"viz":false}},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"legend":{"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"desc"}}},{"id":14,"type":"bargauge","title":"Partitions by topic","gridPos":{"h":8,"w":12,"x":12,"y":22},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"sum by (topic) (kafka_topic_partitions{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"$topic\"})","refId":"A","range":false,"legendFormat":"{{topic}}","instant":true}],"fieldConfig":{"defaults":{"unit":"short","mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"displayMode":"gradient","legend":{"displayMode":"list","placement":"bottom","showLegend":false},"orientation":"horizontal","reduceOptions":{"calcs":["lastNotNull"],"fields":"","values":false},"showUnfilled":true}},{"id":20,"type":"row","title":"Consumer groups","collapsed":false,"gridPos":{"h":1,"w":24,"x":0,"y":30},"panels":[]},{"id":21,"type":"timeseries","title":"Consumer lag by group/topic","gridPos":{"h":8,"w":24,"x":0,"y":31},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"targets":[{"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"editorMode":"code","expr":"sum by (consumergroup, topic) (kafka_consumergroup_lag_sum{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\",topic=~\"$topic\"})","refId":"A","range":true,"legendFormat":"{{consumergroup}} / {{topic}}"}],"fieldConfig":{"defaults":{"unit":"short","custom":{"drawStyle":"line","lineInterpolation":"linear","lineWidth":1,"fillOpacity":10,"spanNulls":false,"showPoints":"auto","axisPlacement":"auto","scaleDistribution":{"type":"linear"},"hideFrom":{"legend":false,"tooltip":false,"viz":false}},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green"}]}},"overrides":[]},"options":{"legend":{"displayMode":"list","placement":"bottom","showLegend":true},"tooltip":{"mode":"multi","sort":"desc"}}}],"preload":false,"refresh":"30s","schemaVersion":41,"tags":["kafka","kafka-exporter","brusnika-stage"],"templating":{"list":[{"current":{"text":"VictoriaMetrics","value":"vm"},"includeAll":false,"label":"Data Source","name":"ds_datasource","options":[],"query":"prometheus","refresh":1,"regex":"","type":"datasource"},{"current":{"text":"All","value":"$__all"},"datasource":{"type":"prometheus","uid":"${ds_datasource}"},"definition":"label_values(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\"},topic)","allValue":".*","includeAll":true,"label":"Topic","name":"topic","options":[],"query":{"qryType":1,"query":"label_values(kafka_topic_partition_current_offset{kafka_instance=\"yc-kafka\",cluster=\"brusnika-stage\"},topic)","refId":"PrometheusVariableQueryEditor-VariableQuery"},"refresh":1,"regex":"","type":"query"}]},"time":{"from":"now-12h","to":"now"},"timepicker":{},"timezone":"","title":"Kafka Exporter / YC Kafka","uid":"kafka-exporter-yc","version":1}
diff --git a/clusters/brusnika-stage/infrastructure/kafka-exporter-yc-rules.yaml b/clusters/brusnika-stage/infrastructure/kafka-exporter-yc-rules.yaml
new file mode 100644
index 0000000..451f469
--- /dev/null
+++ b/clusters/brusnika-stage/infrastructure/kafka-exporter-yc-rules.yaml
@@ -0,0 +1,51 @@
+# Alerting rules for the YC Kafka exporter metrics, evaluated every 5 minutes.
+apiVersion: operator.victoriametrics.com/v1beta1
+kind: VMRule
+metadata:
+  name: kafka-exporter-yc-rules
+  namespace: vmstack
+spec:
+  groups:
+    - name: kafka-exporter-yc.rules
+      interval: 5m
+      rules:
+        # Fires when no `up` series for this exporter equals 1 for 10 minutes.
+        - alert: KafkaExporterYcDown
+          expr: absent(up{kafka_instance="yc-kafka", cluster="brusnika-stage"} == 1)
+          for: 10m
+          labels:
+            severity: critical
+            team: infra
+            cluster: brusnika-stage
+            kafka_instance: yc-kafka
+            source_cluster: yc-kafka
+          annotations:
+            summary: YC Kafka exporter is down in brusnika-stage
+            description: No healthy kafka-exporter-yc target is scraped for 10 minutes.
+        # Fires when a non-internal topic (name not starting with "__") that
+        # reports partitions shows zero offset growth over the last 12 hours.
+        - alert: KafkaTopicNoMessagesFor12h
+          expr: |
+            (
+              sum by (topic, kafka_instance, source_cluster, cluster) (
+                max_over_time(kafka_topic_partition_current_offset{kafka_instance="yc-kafka", cluster="brusnika-stage", topic!~"^__.*"}[12h])
+                -
+                min_over_time(kafka_topic_partition_current_offset{kafka_instance="yc-kafka", cluster="brusnika-stage", topic!~"^__.*"}[12h])
+              ) == 0
+            )
+            and on (topic, kafka_instance, source_cluster, cluster)
+            (
+              sum by (topic, kafka_instance, source_cluster, cluster) (
+                kafka_topic_partitions{kafka_instance="yc-kafka", cluster="brusnika-stage", topic!~"^__.*"}
+              ) > 0
+            )
+          for: 5m
+          labels:
+            severity: warning
+            team: infra
+            cluster: brusnika-stage
+            kafka_instance: yc-kafka
+            source_cluster: yc-kafka
+          annotations:
+            summary: No new messages in Kafka topic for 12h
+            description: Topic {{ $labels.topic }} in {{ $labels.kafka_instance }} has no offset growth for 12 hours.
diff --git a/clusters/brusnika-stage/infrastructure/kafka-exporter-yc.yaml b/clusters/brusnika-stage/infrastructure/kafka-exporter-yc.yaml
new file mode 100644
index 0000000..b2cb0c0
--- /dev/null
+++ b/clusters/brusnika-stage/infrastructure/kafka-exporter-yc.yaml
@@ -0,0 +1,186 @@
+# ServiceAccount the exporter pod runs as (set by the Deployment patch below).
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: kafka-exporter-yc
+  namespace: kafka-exporter
+---
+# kafka-exporter release for YC Kafka. Connection settings are read from files
+# rendered by the Vault agent; the chart-level kafka/sasl/tls values are
+# placeholders that the patched container args override.
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease
+metadata:
+  name: kafka-exporter-yc
+  namespace: kafka-exporter
+spec:
+  dependsOn:
+    - name: prometheus-stack
+      namespace: prometheus-stack
+    - name: vault
+      namespace: vault
+  interval: 5m
+  timeout: 10m
+  chart:
+    spec:
+      chart: kafka-exporter-prod
+      version: "0.27.0"
+      sourceRef:
+        kind: HelmRepository
+        name: yc-oci-charts
+        namespace: flux-system
+      interval: 10m
+  install:
+    remediation:
+      retries: 3
+  upgrade:
+    remediation:
+      retries: 3
+  postRenderers:
+    - kustomize:
+        patches:
+          - target:
+              group: apps
+              version: v1
+              kind: Deployment
+              name: kafka-exporter-yc
+            patch: |-
+              - op: add
+                path: /spec/template/metadata/annotations
+                value:
+                  vault.hashicorp.com/agent-init-first: "true"
+                  vault.hashicorp.com/agent-inject: "true"
+                  vault.hashicorp.com/agent-pre-populate-only: "true"
+                  vault.hashicorp.com/auth-path: auth/kubernetes
+                  vault.hashicorp.com/role: kafka-exporter-yc
+                  vault.hashicorp.com/agent-inject-secret-kafka-bootstrap: secrets/data/vault/apps/kafka-exporter-yc
+                  vault.hashicorp.com/agent-inject-template-kafka-bootstrap: |-
+                    {{- with secret "secrets/data/vault/apps/kafka-exporter-yc" -}}
+                    {{ index .Data.data "KAFKA_BOOTSTRAP" }}
+                    {{- end -}}
+                  vault.hashicorp.com/agent-inject-secret-kafka-user: secrets/data/vault/apps/kafka-exporter-yc
+                  vault.hashicorp.com/agent-inject-template-kafka-user: |-
+                    {{- with secret "secrets/data/vault/apps/kafka-exporter-yc" -}}
+                    {{ index .Data.data "KAFKA_USER" }}
+                    {{- end -}}
+                  vault.hashicorp.com/agent-inject-secret-kafka-password: secrets/data/vault/apps/kafka-exporter-yc
+                  vault.hashicorp.com/agent-inject-template-kafka-password: |-
+                    {{- with secret "secrets/data/vault/apps/kafka-exporter-yc" -}}
+                    {{ index .Data.data "KAFKA_PASSWORD" }}
+                    {{- end -}}
+                  vault.hashicorp.com/agent-inject-secret-kafka-ca.pem: secrets/data/vault/apps/kafka-exporter-yc
+                  vault.hashicorp.com/agent-inject-template-kafka-ca.pem: |-
+                    {{- with secret "secrets/data/vault/apps/kafka-exporter-yc" -}}
+                    {{ index .Data.data "KAFKA_CA_PEM" }}
+                    {{- end -}}
+              - op: add
+                path: /spec/template/spec/serviceAccountName
+                value: kafka-exporter-yc
+              - op: add
+                path: /spec/template/spec/imagePullSecrets
+                value:
+                  - name: regcred
+              - op: add
+                path: /spec/template/spec/containers/0/command
+                value:
+                  - /bin/sh
+                  - -ec
+              - op: replace
+                path: /spec/template/spec/containers/0/args
+                value:
+                  - |-
+                    KAFKA_BOOTSTRAP="$(cat /vault/secrets/kafka-bootstrap)"
+                    KAFKA_USER="$(cat /vault/secrets/kafka-user)"
+                    KAFKA_PASSWORD="$(cat /vault/secrets/kafka-password)"
+                    if command -v kafka_exporter >/dev/null 2>&1; then
+                      KAFKA_EXPORTER_BIN="$(command -v kafka_exporter)"
+                    else
+                      KAFKA_EXPORTER_BIN=/bin/kafka_exporter
+                    fi
+                    OLD_IFS="${IFS}"
+                    IFS=,
+                    set --
+                    for broker in ${KAFKA_BOOTSTRAP}; do
+                      broker="$(printf '%s' "${broker}" | tr -d '[:space:]')"
+                      if [ -n "${broker}" ]; then
+                        set -- "$@" --kafka.server="${broker}"
+                      fi
+                    done
+                    IFS="${OLD_IFS}"
+                    # Fail fast instead of starting without any --kafka.server flag.
+                    if [ "$#" -eq 0 ]; then
+                      echo "KAFKA_BOOTSTRAP contains no broker addresses" >&2
+                      exit 1
+                    fi
+                    exec "${KAFKA_EXPORTER_BIN}" \
+                      "$@" \
+                      --sasl.enabled \
+                      --sasl.username="${KAFKA_USER}" \
+                      --sasl.password="${KAFKA_PASSWORD}" \
+                      --sasl.mechanism=scram-sha512 \
+                      --tls.enabled \
+                      --tls.ca-file=/vault/secrets/kafka-ca.pem \
+                      --kafka.labels=yc-kafka \
+                      --topic.exclude='^__.*' \
+                      --verbosity=0
+          - target:
+              group: monitoring.coreos.com
+              version: v1
+              kind: ServiceMonitor
+              name: kafka-exporter-yc
+            patch: |-
+              - op: add
+                path: /spec/selector/matchLabels/app.kubernetes.io~1instance
+                value: kafka-exporter-yc
+              # Target relabelings are also attached to auto-generated series such
+              # as `up`, which metricRelabelings do not touch; the alert rule and
+              # dashboard select up{kafka_instance="yc-kafka"}.
+              - op: add
+                path: /spec/endpoints/0/relabelings
+                value:
+                  - action: replace
+                    targetLabel: kafka_instance
+                    replacement: yc-kafka
+                  - action: replace
+                    targetLabel: source_cluster
+                    replacement: yc-kafka
+                  - action: replace
+                    targetLabel: monitored_cluster
+                    replacement: yc-kafka
+                  - action: replace
+                    targetLabel: cluster
+                    replacement: brusnika-stage
+  values:
+    fullnameOverride: kafka-exporter-yc
+    image:
+      repository: danielqsj/kafka-exporter
+      tag: latest
+      pullPolicy: IfNotPresent
+    kafkaExporter:
+      kafka:
+        servers:
+          - kafka-bootstrap.from-vault.invalid:9091
+      sasl:
+        enabled: false
+      tls:
+        enabled: false
+    prometheus:
+      serviceMonitor:
+        enabled: true
+        namespace: kafka-exporter
+        interval: 30s
+        additionalLabels:
+          app: kafka-exporter-yc
+        metricRelabelings:
+          - action: replace
+            targetLabel: kafka_instance
+            replacement: yc-kafka
+          - action: replace
+            targetLabel: source_cluster
+            replacement: yc-kafka
+          - action: replace
+            targetLabel: monitored_cluster
+            replacement: yc-kafka
+          - action: replace
+            targetLabel: cluster
+            replacement: brusnika-stage
diff --git a/clusters/brusnika-stage/infrastructure/kustomization.yaml b/clusters/brusnika-stage/infrastructure/kustomization.yaml
index e153780..fda1e60 100644
--- a/clusters/brusnika-stage/infrastructure/kustomization.yaml
+++ b/clusters/brusnika-stage/infrastructure/kustomization.yaml
@@ -21,6 +21,9 @@ resources:
   - ./node-exporter-vmnodescrape.yaml
   - ./istio-gateway-stats-scrape.yaml
   - ./istio-dashboard-compat-vmrule.yaml
+  - ./kafka-exporter-yc.yaml
+  - ./kafka-exporter-yc-rules.yaml
+  - ./kafka-exporter-yc-dashboard.yaml
   - ../../../infrastructure/failed-pod-cleanup
 patches:
   - path: ./patches/istio-gateway.yaml
diff --git a/clusters/brusnika-stage/infrastructure/patches/kafka-exporter.yaml b/clusters/brusnika-stage/infrastructure/patches/kafka-exporter.yaml
index 1e02e59..ec764d3 100644
--- a/clusters/brusnika-stage/infrastructure/patches/kafka-exporter.yaml
+++ b/clusters/brusnika-stage/infrastructure/patches/kafka-exporter.yaml
@@ -22,6 +22,15 @@ spec:
                 path: /spec/template/spec/imagePullSecrets
                 value:
                   - name: regcred
+          - target:
+              group: monitoring.coreos.com
+              version: v1
+              kind: ServiceMonitor
+              name: kafka-exporter-kafka-exporter-prod
+            patch: |-
+              - op: add
+                path: /spec/selector/matchLabels/app.kubernetes.io~1instance
+                value: kafka-exporter
   values:
     image:
       repository: danielqsj/kafka-exporter