Skip to content

Commit

Permalink
fix(charts/authentik): improve monitoring (#145)
Browse files Browse the repository at this point in the history
* fix(charts/authentik): improve monitoring

* Update charts/authentik/templates/prom-service-monitor.yaml

Signed-off-by: Jens L. <jens@beryju.org>

---------

Signed-off-by: Jens L. <jens@beryju.org>
Co-authored-by: Jens L <jens@beryju.org>
Co-authored-by: Jens L <jens@goauthentik.io>
  • Loading branch information
3 people committed Apr 18, 2023
1 parent dd9a5f0 commit e989841
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 37 deletions.
2 changes: 2 additions & 0 deletions charts/authentik/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -127,9 +127,11 @@ redis:
| postgresql.postgresqlUsername | string | `"authentik"` | |
| priorityClassName | string | `nil` | Custom priority class for different treatment by the scheduler |
| prometheus.rules.create | bool | `false` | |
| prometheus.rules.labels | object | `{}` | additional labels to set on the PrometheusRule |
| prometheus.serviceMonitor.create | bool | `false` | |
| prometheus.serviceMonitor.interval | string | `"30s"` | |
| prometheus.serviceMonitor.scrapeTimeout | string | `"3s"` | |
| prometheus.serviceMonitor.labels | object | `{}` | additional labels to set on the ServiceMonitor |
| readinessProbe.enabled | bool | `true` | |
| readinessProbe.httpGet.path | string | `"/-/health/ready/"` | |
| readinessProbe.httpGet.port | string | `"http"` | |
Expand Down
59 changes: 38 additions & 21 deletions charts/authentik/templates/prom-rules.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
{{- if .Values.prometheus.rules.create -}}
{{- with .Values.prometheus.rules }}
{{- if .create -}}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ include "common.names.fullname" . }}
name: {{ include "common.names.fullname" $ }}
labels:
{{- include "common.labels" . | nindent 4 }}
{{- include "common.labels" $ | nindent 4 }}
{{- with .labels }}
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
groups:
- name: authentik Aggregate request counters
Expand Down Expand Up @@ -47,6 +51,7 @@ spec:
expr: sum(rate(django_http_exceptions_total_by_type[30s])) by (job,type)
- record: job:django_http_exceptions_total_by_view:sum_rate30s
expr: sum(rate(django_http_exceptions_total_by_view[30s])) by (job,view)

- name: authentik Aggregate latency histograms
rules:
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
Expand Down Expand Up @@ -81,6 +86,7 @@ spec:
expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_seconds_bucket[30s])) by (job, le))
labels:
quantile: "99.9"

- name: authentik Aggregate model operations
rules:
- record: job:django_model_inserts_total:sum_rate1m
Expand All @@ -101,48 +107,59 @@ spec:
expr: sum(rate(django_db_execute_many_total[30s])) by (alias, vendor)
- record: job:django_db_errors_total:sum_rate30s
expr: sum(rate(django_db_errors_total[30s])) by (alias, vendor, type)

- name: authentik Aggregate migrations
rules:
- record: job:django_migrations_applied_total:max
expr: max(django_migrations_applied_total) by (job, connection)
- record: job:django_migrations_unapplied_total:max
expr: max(django_migrations_unapplied_total) by (job, connection)

- name: authentik Alerts
rules:
- alert: NoWorkersConnected
labels:
severity: critical
expr: max without (pid) (authentik_admin_workers) < 1
for: 10m
annotations:
message: |
authentik instance {{ printf "{{ $labels.instance }}" }}'s worker are either not running or not connected.
{{`
summary: No workers connected
for: 10m
message: authentik instance {{ $labels.instance }}'s worker are either not running or not connected.
`}}
- alert: PendingMigrations
labels:
severity: critical
- alert: PendingMigrations
expr: max without (pid) (django_migrations_unapplied_total) > 0
for: 10m
annotations:
message: |
authentik instance {{ printf "{{ $labels.instance }}" }} has pending database migrations
{{`
summary: Pending database migrations
for: 10m
message: authentik instance {{ $labels.instance }} has pending database migrations
`}}
- alert: FailedSystemTasks
labels:
severity: critical
- alert: FailedSystemTasks
expr: sum(increase(authentik_system_tasks{status="TaskResultStatus.ERROR"}[2h])) > 0
for: 2h
annotations:
message: |
System task {{ printf "{{ $labels.task_name }}" }} has failed
{{`
summary: Failed system tasks
for: 2h
message: System task {{ $labels.task_name }} has failed
`}}
- alert: DisconnectedOutposts
labels:
severity: critical
- alert: DisconnectedOutposts
expr: sum by (outpost) (max without (pid) (authentik_outposts_connected{uid!~"specific.*"})) < 1
for: 30m
annotations:
message: |
Outpost {{ printf "{{ $labels.outpost }}" }} has at least 1 disconnected instance
{{`
summary: Disconnected outpost
for: 30m
labels:
severity: critical
{{- end }}
message: Outpost {{ $labels.outpost }} has at least 1 disconnected instance
`}}
{{- end }}
{{- end }}
19 changes: 12 additions & 7 deletions charts/authentik/templates/prom-service-monitor.yaml
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
{{- if .Values.prometheus.serviceMonitor.create -}}
{{- with .Values.prometheus.serviceMonitor }}
{{- if .create -}}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: {{ include "common.names.fullname" $ }}
labels:
{{- include "common.labels" . | nindent 4 }}
name: {{ include "common.names.fullname" . }}
{{- include "common.labels" $ | nindent 4 }}
{{- with .labels }}
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
endpoints:
- port: http-metrics
scrapeTimeout: {{ .Values.prometheus.serviceMonitor.scrapeTimeout }}
interval: {{ .Values.prometheus.serviceMonitor.interval }}
scrapeTimeout: {{ .scrapeTimeout }}
interval: {{ .interval }}
selector:
matchLabels:
{{- include "common.labels" . | nindent 6 }}
{{- end }}
{{- include "common.labels.selectorLabels" $ | nindent 6 }}
{{- end }}
{{- end }}
18 changes: 9 additions & 9 deletions charts/authentik/templates/service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@ kind: Service
metadata:
name: {{ include "common.names.fullname" . }}
labels:
{{- include "common.labels" . | nindent 4 }}
{{- with .Values.service.labels }}
{{- toYaml . | nindent 4 }}
{{- end }}
{{- with .Values.service.annotations }}
{{- include "common.labels" . | nindent 4 }}
{{- with .Values.service.labels }}
{{- toYaml . | nindent 4 }}
{{- end }}
{{- with .Values.service.annotations }}
annotations:
{{ toYaml . | nindent 4 }}
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
{{- $type := default "ClusterIP" .Values.service.type }}
Expand All @@ -26,20 +26,20 @@ spec:
{{- end }}
{{- with .Values.service.loadBalancerSourceRanges }}
loadBalancerSourceRanges:
{{- toYaml . | nindent 4 }}
{{- toYaml . | nindent 4 }}
{{- end }}
{{- end }}
type: {{ $type }}
{{- if .Values.service.sessionAffinity }}
sessionAffinity: {{ .Values.service.sessionAffinity }}
{{- with .Values.service.sessionAffinityConfig }}
sessionAffinityConfig:
{{- toYaml . | nindent 4 }}
{{- toYaml . | nindent 4 }}
{{- end }}
{{- end }}
{{- with .Values.service.externalIPs }}
externalIPs:
{{- toYaml . | nindent 4 }}
{{- toYaml . | nindent 4 }}
{{- end }}
{{- with .Values.service.publishNotReadyAddresses }}
publishNotReadyAddresses: {{ . }}
Expand Down
4 changes: 4 additions & 0 deletions charts/authentik/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -175,8 +175,12 @@ prometheus:
create: false
interval: 30s
scrapeTimeout: 3s
# -- additional labels to set on the ServiceMonitor
labels: {}
rules:
create: false
# -- additional labels to set on the PrometheusRule
labels: {}

geoip:
# -- optional GeoIP, deploys a cronjob to download the maxmind database
Expand Down

0 comments on commit e989841

Please sign in to comment.