# alerts.yaml: Helm template for the PrometheusRule holding the Promtail alerts.
---
# PrometheusRule resource that ships the Promtail alerting rules of this chart.
# Metadata is assembled from the promtail-mixin helper templates plus the
# optional additionalAnnotations / additionalLabels values.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  # "<fullname>-alerts", cut to 63 characters with a trailing dash removed.
  # Quoted so the rendered value is always a YAML string.
  name: {{ printf "%s-%s" (include "promtail-mixin.fullname" .) "alerts" | trunc 63 | trimSuffix "-" | quote }}
  # The template actions below start at column 0 on purpose: "indent 4"
  # supplies the nesting for every line of the included text.
  annotations:
{{ include "promtail-mixin.annotations" . | indent 4 }}
{{- if .Values.additionalAnnotations }}
{{ toYaml .Values.additionalAnnotations | indent 4 }}
{{- end }}
  labels:
    app: {{ include "promtail-mixin.name" . | quote }}
{{ include "promtail-mixin.labels" . | indent 4 }}
{{- if .Values.additionalLabels }}
{{ toYaml .Values.additionalLabels | indent 4 }}
{{- end }}
spec:
  groups:
    - name: promtail_alerts
      rules:
        # Fires when, per (namespace, job, route, instance), requests whose
        # status_code matches 5xx or "failed" exceed 10 percent of all
        # requests (1m rate) for 15 minutes.
        - alert: PromtailRequestsErrors
          annotations:
            message: |
              {{`{{`}} $labels.job {{`}}`}} {{`{{`}} $labels.route {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}% errors.
          expr: |
            100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance)
            /
            sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance)
            > 10
          for: 15m
          labels:
            severity: critical
        # Fires when the 99quantile recording rule stays above 1 for 15
        # minutes. NOTE(review): the recording rule is not defined in this
        # file; confirm it is shipped by the companion rules template.
        - alert: PromtailRequestLatency
          annotations:
            message: |
              {{`{{`}} $labels.job {{`}}`}} {{`{{`}} $labels.route {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}s 99th percentile latency.
          expr: |
            job_status_code_namespace:promtail_request_duration_seconds:99quantile > 1
          for: 15m
          labels:
            severity: critical
        # Fires when the absolute gap between promtail_file_bytes_total and
        # promtail_read_bytes_total stays above 1e6 for 15 minutes.
        - alert: PromtailFileLagging
          annotations:
            message: |
              {{`{{`}} $labels.instance {{`}}`}} {{`{{`}} $labels.job {{`}}`}} {{`{{`}} $labels.path {{`}}`}} has been lagging by more than 1MB for more than 15m.
          expr: |
            abs(promtail_file_bytes_total - promtail_read_bytes_total) > 1e6
          for: 15m
          labels:
            severity: warning
        # Fires when a promtail_file_bytes_total series has had no matching
        # promtail_read_bytes_total series for 15 minutes.
        - alert: PromtailFileMissing
          annotations:
            message: |
              {{`{{`}} $labels.instance {{`}}`}} {{`{{`}} $labels.job {{`}}`}} {{`{{`}} $labels.path {{`}}`}} matches the glob but is not being tailed.
          expr: |
            promtail_file_bytes_total unless promtail_read_bytes_total
          for: 15m
          labels:
            severity: critical