diff --git a/charts/thanos-mixin/Chart.yaml b/charts/thanos-mixin/Chart.yaml index ac104508..466fe459 100644 --- a/charts/thanos-mixin/Chart.yaml +++ b/charts/thanos-mixin/Chart.yaml @@ -28,12 +28,12 @@ keywords: # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.11.1 +version: 0.12.0 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. -appVersion: 0.18.0 +appVersion: 0.21.0 maintainers: - name: nlamirault diff --git a/charts/thanos-mixin/README.md b/charts/thanos-mixin/README.md index 9b0bc9d0..08e2c98c 100644 --- a/charts/thanos-mixin/README.md +++ b/charts/thanos-mixin/README.md @@ -1,6 +1,6 @@ # thanos-mixin -![Version: 0.11.1](https://img.shields.io/badge/Version-0.11.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 0.18.0](https://img.shields.io/badge/AppVersion-0.18.0-informational?style=flat-square) +![Version: 0.12.0](https://img.shields.io/badge/Version-0.12.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 0.21.0](https://img.shields.io/badge/AppVersion-0.21.0-informational?style=flat-square) A Helm chart for Thanos Mixin diff --git a/charts/thanos-mixin/dashboards/bucket_replicate.json b/charts/thanos-mixin/dashboards/bucket_replicate.json new file mode 100644 index 00000000..db7de7b1 --- /dev/null +++ b/charts/thanos-mixin/dashboards/bucket_replicate.json @@ -0,0 +1,512 @@ +{ + "annotations": { + "list": [ ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(thanos_replicate_replication_runs_total{result=\"error\", job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_replicate_replication_runs_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of errors.", + "fill": 10, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, result) (rate(thanos_replicate_replication_runs_total{result=\"error\", job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{result}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to run a replication cycle.", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{result=\"success\", job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{result=\"success\", job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{result=\"success\", job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Bucket Replicate Runs", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(blocks_meta_synced{state=\"loaded\", job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "meta loads", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum by (job) (rate(blocks_meta_synced{state=\"failed\", job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "partial meta reads", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum by (job) (rate(thanos_replicate_blocks_already_replicated_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "already replicated blocks", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum by (job) (rate(thanos_replicate_blocks_replicated_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "replicated blocks", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum by (job) (rate(thanos_replicate_objects_replicated_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "replicated objects", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Metrics", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Bucket Replication", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "thanos-mixin", + "portefaix-v0.6.0" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "auto": true, + "auto_count": 300, + "auto_min": "10s", + "current": { + "text": "5m", + "value": "5m" + }, + "hide": 0, + "label": "interval", + "name": "interval", + "query": "5m,10m,30m,1h,6h,12h", + "refresh": 2, + "type": "interval" + }, + { + "allValue": null, + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": false, + "name": "job", + "options": [ ], + "query": "label_values(up{job=~\".*thanos-bucket-replicate.*\"}, job)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "UTC", + "title": "Thanos / BucketReplicate", + "uid": "49f644ecf8e31dd1a5084ae2a5f10e80", + "version": 0 +} diff --git a/charts/thanos-mixin/dashboards/compact.json b/charts/thanos-mixin/dashboards/compact.json new file mode 100644 index 00000000..4ffff26b --- /dev/null +++ b/charts/thanos-mixin/dashboards/compact.json @@ -0,0 +1,1820 @@ +{ + "annotations": { + "list": [ ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of execution for compactions against blocks that are stored in the bucket by compaction group.", + "fill": 10, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, group) (rate(thanos_compact_group_compactions_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "compaction {{job}} {{group}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of executed compactions against blocks that are stored in the bucket.", + "fill": 10, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_compact_group_compactions_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Group Compaction", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of execution for downsampling against blocks that are stored in the bucket by compaction group.", + "fill": 10, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, group) (rate(thanos_compact_downsample_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "downsample {{job}} {{group}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of executed downsampling against blocks that are stored in the bucket.", + "fill": 10, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(thanos_compact_downsample_failed_total{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_compact_downsample_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Downsample", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of execution for removals of blocks if their data is available as part of a block with a higher compaction level.", + "fill": 10, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(thanos_compact_garbage_collection_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "garbage collection {{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of executed garbage collections.", + "fill": 10, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(thanos_compact_garbage_collection_failures_total{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_compact_garbage_collection_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to execute garbage collection in quantiles.", + "fill": 1, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(thanos_compact_garbage_collection_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(thanos_compact_garbage_collection_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(thanos_compact_garbage_collection_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Garbage Collection", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows deletion rate of blocks already marked for deletion.", + "fill": 10, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(thanos_compact_blocks_cleaned_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Blocks cleanup {{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Deletion Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows deletion failures rate of blocks already marked for deletion.", + "fill": 1, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(thanos_compact_block_cleanup_failures_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Blocks cleanup failures {{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Deletion Error Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate at which blocks are marked for deletion (from GC and retention policy).", + "fill": 1, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(thanos_compact_blocks_marked_for_deletion_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Blocks marked {{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Marking Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Blocks deletion", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of execution for all meta files from blocks in the bucket into the memory.", + "fill": 10, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(thanos_blocks_meta_syncs_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "sync {{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of executed meta file sync.", + "fill": 10, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(thanos_blocks_meta_sync_failures_total{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_blocks_meta_syncs_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to execute meta file sync, in quantiles.", + "fill": 1, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(thanos_blocks_meta_sync_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(thanos_blocks_meta_sync_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(thanos_blocks_meta_sync_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Sync Meta", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of execution for operations against the bucket.", + "fill": 10, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, operation) (rate(thanos_objstore_bucket_operations_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{operation}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of executed operations against the bucket.", + "fill": 10, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to execute operations against the bucket, in quantiles.", + "fill": 1, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Object Store Operations", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_memstats_alloc_bytes{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc all {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_heap_alloc_bytes{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc heap {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "rate(go_memstats_alloc_bytes_total{job=\"$job\"}[30s])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc rate all {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "rate(go_memstats_heap_alloc_bytes{job=\"$job\"}[30s])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc rate heap {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_stack_inuse_bytes{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inuse heap {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_heap_inuse_bytes{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inuse stack {{instance}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Used", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 18, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_goroutines{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Goroutines", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 19, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_gc_duration_seconds{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{quantile}} {{instance}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "GC Time Quantiles", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Resources", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "thanos-mixin", + "portefaix-v0.6.0" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "auto": true, + "auto_count": 300, + "auto_min": "10s", + "current": { + "text": "5m", + "value": "5m" + }, + "hide": 0, + "label": "interval", + "name": "interval", + "query": "5m,10m,30m,1h,6h,12h", + "refresh": 2, + "type": "interval" + }, + { + "allValue": null, + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": false, + "name": "job", + "options": [ ], + "query": "label_values(up{job=~\".*thanos-compact.*\"}, job)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "UTC", + "title": "Thanos / Compact", + "uid": "651943d05a8123e32867b4673963f42b", + "version": 0 +} diff --git a/charts/thanos-mixin/dashboards/overview.json b/charts/thanos-mixin/dashboards/overview.json new file mode 100644 index 00000000..ab1244a5 --- /dev/null +++ b/charts/thanos-mixin/dashboards/overview.json @@ -0,0 +1,2182 @@ +{ + "annotations": { + "list": [ ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of requests against /query for the given time.", + "fill": 10, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Query", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Query", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/1../", + "color": "#EAB839" + }, + { + "alias": "/2../", + "color": "#37872D" + }, + { + "alias": "/3../", + "color": "#E0B400" + }, + { + "alias": "/4../", + "color": "#1F60C4" + }, + { + "alias": "/5../", + "color": "#C4162A" + } + ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, handler, code) (rate(http_requests_total{handler=\"query\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{handler}} {{code}}", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Requests Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the the total number of handled requests against /query.", + "fill": 10, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Query", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Query", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(http_requests_total{handler=\"query\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{handler=\"query\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Requests Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests.", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + { + "dashboard": "Thanos / Query", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Query", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{handler=\"query\"}[$interval])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} P99", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + { + "colorMode": "warning", + "fill": true, + "line": true, + "op": "gt", + "value": 0.5, + "yaxis": "left" + }, + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 1, + "yaxis": "left" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Latency 99th Percentile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Instant Query", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of requests against /query_range for the given time range.", + "fill": 10, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Query", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Query", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/1../", + "color": "#EAB839" + }, + { + "alias": "/2../", + "color": "#37872D" + }, + { + "alias": "/3../", + "color": "#E0B400" + }, + { + "alias": "/4../", + "color": "#1F60C4" + }, + { + "alias": "/5../", + "color": "#C4162A" + } + ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, handler, code) (rate(http_requests_total{handler=\"query_range\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{handler}} {{code}}", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Requests Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the the total number of handled requests against /query_range.", + "fill": 10, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Query", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Query", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(http_requests_total{handler=\"query_range\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{handler=\"query_range\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Requests Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests.", + "fill": 1, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + { + "dashboard": "Thanos / Query", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Query", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{handler=\"query_range\"}[$interval])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} P99", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + { + "colorMode": "warning", + "fill": true, + "line": true, + "op": "gt", + "value": 0.5, + "yaxis": "left" + }, + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 1, + "yaxis": "left" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Latency 99th Percentile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Range Query", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Unary gRPC requests from queriers.", + "fill": 10, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Store", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Store", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/Aborted/", + "color": "#EAB839" + }, + { + "alias": "/AlreadyExists/", + "color": "#37872D" + }, + { + "alias": "/FailedPrecondition/", + "color": "#E0B400" + }, + { + "alias": "/Unimplemented/", + "color": "#E0B400" + }, + { + "alias": "/InvalidArgument/", + "color": "#1F60C4" + }, + { + "alias": "/NotFound/", + "color": "#1F60C4" + }, + { + "alias": "/PermissionDenied/", + "color": "#1F60C4" + }, + { + "alias": "/Unauthenticated/", + "color": "#1F60C4" + }, + { + "alias": "/Canceled/", + "color": "#C4162A" + }, + { + "alias": "/DataLoss/", + "color": "#C4162A" + }, + { + "alias": "/DeadlineExceeded/", + "color": "#C4162A" + }, + { + "alias": "/Internal/", + "color": "#C4162A" + }, + { + "alias": "/OutOfRange/", + "color": "#C4162A" + }, + { + "alias": "/ResourceExhausted/", + "color": "#C4162A" + }, + { + "alias": "/Unavailable/", + "color": "#C4162A" + }, + { + "alias": "/Unknown/", + "color": "#C4162A" + }, + { + "alias": "/OK/", + "color": "#37872D" + }, + { + "alias": "error", + "color": "#C4162A" + } + ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, grpc_method, grpc_code) (rate(grpc_server_handled_total{grpc_type=\"unary\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "gPRC (Unary) Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", + "fill": 10, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Store", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Store", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{grpc_type=\"unary\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "gPRC (Unary) Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from queriers.", + "fill": 1, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + { + "dashboard": "Thanos / Store", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Store", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{grpc_type=\"unary\"}[$interval])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} P99", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + { + "colorMode": "warning", + "fill": true, + "line": true, + "op": "gt", + "value": 0.5, + "yaxis": "left" + }, + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 1, + "yaxis": "left" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "gRPC Latency 99th Percentile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Store", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Unary gRPC requests from queriers.", + "fill": 10, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Sidecar", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Sidecar", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/Aborted/", + "color": "#EAB839" + }, + { + "alias": "/AlreadyExists/", + "color": "#37872D" + }, + { + "alias": "/FailedPrecondition/", + "color": "#E0B400" + }, + { + "alias": "/Unimplemented/", + "color": "#E0B400" + }, + { + "alias": "/InvalidArgument/", + "color": "#1F60C4" + }, + { + "alias": "/NotFound/", + "color": "#1F60C4" + }, + { + "alias": "/PermissionDenied/", + "color": "#1F60C4" + }, + { + "alias": "/Unauthenticated/", + "color": "#1F60C4" + }, + { + "alias": "/Canceled/", + "color": "#C4162A" + }, + { + "alias": "/DataLoss/", + "color": "#C4162A" + }, + { + "alias": "/DeadlineExceeded/", + "color": "#C4162A" + }, + { + "alias": "/Internal/", + "color": "#C4162A" + }, + { + "alias": "/OutOfRange/", + "color": "#C4162A" + }, + { + "alias": "/ResourceExhausted/", + "color": "#C4162A" + }, + { + "alias": "/Unavailable/", + "color": "#C4162A" + }, + { + "alias": "/Unknown/", + "color": "#C4162A" + }, + { + "alias": "/OK/", + "color": "#37872D" + }, + { + "alias": "error", + "color": "#C4162A" + } + ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, grpc_method, grpc_code) (rate(grpc_server_handled_total{grpc_type=\"unary\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "gPRC (Unary) Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", + "fill": 10, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Sidecar", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Sidecar", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{grpc_type=\"unary\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "gPRC (Unary) Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", + "fill": 1, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + { + "dashboard": "Thanos / Sidecar", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Sidecar", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{grpc_type=\"unary\"}[$interval])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} P99", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + { + "colorMode": "warning", + "fill": true, + "line": true, + "op": "gt", + "value": 0.5, + "yaxis": "left" + }, + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 1, + "yaxis": "left" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "gPRC (Unary) Latency 99th Percentile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Sidecar", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of incoming requests.", + "fill": 10, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Receive", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Receive", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/1../", + "color": "#EAB839" + }, + { + "alias": "/2../", + "color": "#37872D" + }, + { + "alias": "/3../", + "color": "#E0B400" + }, + { + "alias": "/4../", + "color": "#1F60C4" + }, + { + "alias": "/5../", + "color": "#C4162A" + } + ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, handler, code) (rate(http_requests_total{handler=\"receive\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{handler}} {{code}}", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Incoming Requests Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled incoming requests.", + "fill": 10, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Receive", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Receive", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(http_requests_total{handler=\"receive\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{handler=\"receive\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Incoming Requests Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle incoming requests.", + "fill": 1, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + { + "dashboard": "Thanos / Receive", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Receive", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{handler=\"receive\"}[$interval])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} P99", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + { + "colorMode": "warning", + "fill": true, + "line": true, + "op": "gt", + "value": 0.5, + "yaxis": "left" + }, + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 1, + "yaxis": "left" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Incoming Requests Latency 99th Percentile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Receive", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of alerts that successfully sent to alert manager.", + "fill": 10, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Rule", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Rule", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, alertmanager) (rate(thanos_alert_sender_alerts_sent_total{}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{alertmanager}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Alert Sent Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of sent alerts.", + "fill": 10, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Rule", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Rule", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(thanos_alert_sender_errors_total{}[$interval])) / sum by (job) (rate(thanos_alert_sender_alerts_sent_total{}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Alert Sent Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to send alerts to alert manager.", + "fill": 1, + "id": 18, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + { + "dashboard": "Thanos / Rule", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Rule", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(thanos_alert_sender_latency_seconds_bucket{}[$interval])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} P99", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + { + "colorMode": "warning", + "fill": true, + "line": true, + "op": "gt", + "value": 0.5, + "yaxis": "left" + }, + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 1, + "yaxis": "left" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Alert Sent Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Rule", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of execution for compactions against blocks that are stored in the bucket by compaction group.", + "fill": 10, + "id": 19, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Compact", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Compact", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(thanos_compact_group_compactions_total{}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "compaction {{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Compaction Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of executed compactions against blocks that are stored in the bucket.", + "fill": 10, + "id": 20, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Compact", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Compact", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(thanos_compact_group_compactions_failures_total{}[$interval])) / sum by (job) (rate(thanos_compact_group_compactions_total{}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Compaction Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Compact", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "thanos-mixin", + "portefaix-v0.6.0" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "auto": true, + "auto_count": 300, + "auto_min": "10s", + "current": { + "text": "5m", + "value": "5m" + }, + "hide": 0, + "label": "interval", + "name": "interval", + "query": "5m,10m,30m,1h,6h,12h", + "refresh": 2, + "type": "interval" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "UTC", + "title": "Thanos / Overview", + "uid": "0cb8830a6e957978796729870f560cda", + "version": 0 +} diff --git a/charts/thanos-mixin/dashboards/query.json b/charts/thanos-mixin/dashboards/query.json new file mode 100644 index 00000000..79fbb2bf --- /dev/null +++ b/charts/thanos-mixin/dashboards/query.json @@ -0,0 +1,1879 @@ +{ + "annotations": { + "list": [ ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of requests against /query for the given time.", + "fill": 10, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/1../", + "color": "#EAB839" + }, + { + "alias": "/2../", + "color": "#37872D" + }, + { + "alias": "/3../", + "color": "#E0B400" + }, + { + "alias": "/4../", + "color": "#1F60C4" + }, + { + "alias": "/5../", + "color": "#C4162A" + } + ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, handler, code) (rate(http_requests_total{job=~\"$job\", handler=\"query\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{handler}} {{code}}", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the the total number of handled requests against /query.", + "fill": 10, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests in quantiles.", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~\"$job\", handler=\"query\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~\"$job\", handler=\"query\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~\"$job\", handler=\"query\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Instant Query API", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of requests against /query_range for the given time range.", + "fill": 10, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/1../", + "color": "#EAB839" + }, + { + "alias": "/2../", + "color": "#37872D" + }, + { + "alias": "/3../", + "color": "#E0B400" + }, + { + "alias": "/4../", + "color": "#1F60C4" + }, + { + "alias": "/5../", + "color": "#C4162A" + } + ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, handler, code) (rate(http_requests_total{job=~\"$job\", handler=\"query_range\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{handler}} {{code}}", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the the total number of handled requests against /query_range.", + "fill": 10, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query_range\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query_range\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests in quantiles.", + "fill": 1, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~\"$job\", handler=\"query_range\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~\"$job\", handler=\"query_range\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~\"$job\", handler=\"query_range\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Range Query API", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Unary gRPC requests from other queriers.", + "fill": 10, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/Aborted/", + "color": "#EAB839" + }, + { + "alias": "/AlreadyExists/", + "color": "#37872D" + }, + { + "alias": "/FailedPrecondition/", + "color": "#E0B400" + }, + { + "alias": "/Unimplemented/", + "color": "#E0B400" + }, + { + "alias": "/InvalidArgument/", + "color": "#1F60C4" + }, + { + "alias": "/NotFound/", + "color": "#1F60C4" + }, + { + "alias": "/PermissionDenied/", + "color": "#1F60C4" + }, + { + "alias": "/Unauthenticated/", + "color": "#1F60C4" + }, + { + "alias": "/Canceled/", + "color": "#C4162A" + }, + { + "alias": "/DataLoss/", + "color": "#C4162A" + }, + { + "alias": "/DeadlineExceeded/", + "color": "#C4162A" + }, + { + "alias": "/Internal/", + "color": "#C4162A" + }, + { + "alias": "/OutOfRange/", + "color": "#C4162A" + }, + { + "alias": "/ResourceExhausted/", + "color": "#C4162A" + }, + { + "alias": "/Unavailable/", + "color": "#C4162A" + }, + { + "alias": "/Unknown/", + "color": "#C4162A" + }, + { + "alias": "/OK/", + "color": "#37872D" + }, + { + "alias": "error", + "color": "#C4162A" + } + ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, grpc_method, grpc_code) (rate(grpc_client_handled_total{job=~\"$job\", grpc_type=\"unary\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the the total number of handled requests from other queriers.", + "fill": 10, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(grpc_client_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_client_handled_total{job=~\"$job\", grpc_type=\"unary\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from other queriers, in quantiles.", + "fill": 1, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(grpc_client_handling_seconds_bucket{job=~\"$job\", grpc_type=\"unary\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(grpc_client_handling_seconds_bucket{job=~\"$job\", grpc_type=\"unary\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(grpc_client_handling_seconds_bucket{job=~\"$job\", grpc_type=\"unary\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "gRPC (Unary)", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Streamed gRPC requests from other queriers.", + "fill": 10, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/Aborted/", + "color": "#EAB839" + }, + { + "alias": "/AlreadyExists/", + "color": "#37872D" + }, + { + "alias": "/FailedPrecondition/", + "color": "#E0B400" + }, + { + "alias": "/Unimplemented/", + "color": "#E0B400" + }, + { + "alias": "/InvalidArgument/", + "color": "#1F60C4" + }, + { + "alias": "/NotFound/", + "color": "#1F60C4" + }, + { + "alias": "/PermissionDenied/", + "color": "#1F60C4" + }, + { + "alias": "/Unauthenticated/", + "color": "#1F60C4" + }, + { + "alias": "/Canceled/", + "color": "#C4162A" + }, + { + "alias": "/DataLoss/", + "color": "#C4162A" + }, + { + "alias": "/DeadlineExceeded/", + "color": "#C4162A" + }, + { + "alias": "/Internal/", + "color": "#C4162A" + }, + { + "alias": "/OutOfRange/", + "color": "#C4162A" + }, + { + "alias": "/ResourceExhausted/", + "color": "#C4162A" + }, + { + "alias": "/Unavailable/", + "color": "#C4162A" + }, + { + "alias": "/Unknown/", + "color": "#C4162A" + }, + { + "alias": "/OK/", + "color": "#37872D" + }, + { + "alias": "error", + "color": "#C4162A" + } + ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, grpc_method, grpc_code) (rate(grpc_client_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the the total number of handled requests from other queriers.", + "fill": 10, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(grpc_client_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (job) (rate(grpc_client_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from other queriers, in quantiles", + "fill": 1, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(grpc_client_handling_seconds_bucket{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(grpc_client_handling_seconds_bucket{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(grpc_client_handling_seconds_bucket{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "gRPC (Stream)", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of DNS lookups to discover stores.", + "fill": 1, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "lookups {{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of failures compared to the the total number of executed DNS lookups.", + "fill": 10, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~\"$job\"}[$interval])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "DNS", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_memstats_alloc_bytes{job=~\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc all {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_heap_alloc_bytes{job=~\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc heap {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "rate(go_memstats_alloc_bytes_total{job=~\"$job\"}[30s])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc rate all {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "rate(go_memstats_heap_alloc_bytes{job=~\"$job\"}[30s])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc rate heap {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_stack_inuse_bytes{job=~\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inuse heap {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_heap_inuse_bytes{job=~\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inuse stack {{instance}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Used", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_goroutines{job=~\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Goroutines", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_gc_duration_seconds{job=~\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{quantile}} {{instance}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "GC Time Quantiles", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Resources", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "thanos-mixin", + "portefaix-v0.6.0" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "auto": true, + "auto_count": 300, + "auto_min": "10s", + "current": { + "text": "5m", + "value": "5m" + }, + "hide": 0, + "label": "interval", + "name": "interval", + "query": "5m,10m,30m,1h,6h,12h", + "refresh": 2, + "type": "interval" + }, + { + "allValue": null, + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": false, + "name": "job", + "options": [ ], + "query": "label_values(up{job=~\".*thanos-query.*\"}, job)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "UTC", + "title": "Thanos / Query", + "uid": "af36c91291a603f1d9fbdabdd127ac4a", + "version": 0 +} diff --git a/charts/thanos-mixin/dashboards/receive.json b/charts/thanos-mixin/dashboards/receive.json new file mode 100644 index 00000000..7f55436c --- /dev/null +++ b/charts/thanos-mixin/dashboards/receive.json @@ -0,0 +1,2223 @@ +{ + "annotations": { + "list": [ ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of incoming requests.", + "fill": 10, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/1../", + "color": "#EAB839" + }, + { + "alias": "/2../", + "color": "#37872D" + }, + { + "alias": "/3../", + "color": "#E0B400" + }, + { + "alias": "/4../", + "color": "#1F60C4" + }, + { + "alias": "/5../", + "color": "#C4162A" + } + ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, handler, code) (rate(http_requests_total{job=\"$job\", handler=\"receive\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{handler}} {{code}}", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled incoming requests.", + "fill": 10, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(http_requests_total{job=\"$job\", handler=\"receive\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{job=\"$job\", handler=\"receive\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle incoming requests in quantiles.", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=\"$job\", handler=\"receive\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=\"$job\", handler=\"receive\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=\"$job\", handler=\"receive\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "WRITE - Incoming Request", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of replications to other receive nodes.", + "fill": 1, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(thanos_receive_replications_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "all {{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of replications to other receive nodes.", + "fill": 10, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(thanos_receive_replications_total{job=\"$job\", result=\"error\"}[$interval])) / sum by (job) (rate(thanos_receive_replications_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "WRITE - Replication", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of forwarded requests to other receive nodes.", + "fill": 1, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(thanos_receive_forward_requests_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "all {{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of forwareded requests to other receive nodes.", + "fill": 10, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(thanos_receive_forward_requests_total{job=\"$job\", result=\"error\"}[$interval])) / sum by (job) (rate(thanos_receive_forward_requests_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "WRITE - Forward Request", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Unary gRPC requests from queriers.", + "fill": 10, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/Aborted/", + "color": "#EAB839" + }, + { + "alias": "/AlreadyExists/", + "color": "#37872D" + }, + { + "alias": "/FailedPrecondition/", + "color": "#E0B400" + }, + { + "alias": "/Unimplemented/", + "color": "#E0B400" + }, + { + "alias": "/InvalidArgument/", + "color": "#1F60C4" + }, + { + "alias": "/NotFound/", + "color": "#1F60C4" + }, + { + "alias": "/PermissionDenied/", + "color": "#1F60C4" + }, + { + "alias": "/Unauthenticated/", + "color": "#1F60C4" + }, + { + "alias": "/Canceled/", + "color": "#C4162A" + }, + { + "alias": "/DataLoss/", + "color": "#C4162A" + }, + { + "alias": "/DeadlineExceeded/", + "color": "#C4162A" + }, + { + "alias": "/Internal/", + "color": "#C4162A" + }, + { + "alias": "/OutOfRange/", + "color": "#C4162A" + }, + { + "alias": "/ResourceExhausted/", + "color": "#C4162A" + }, + { + "alias": "/Unavailable/", + "color": "#C4162A" + }, + { + "alias": "/Unknown/", + "color": "#C4162A" + }, + { + "alias": "/OK/", + "color": "#37872D" + }, + { + "alias": "error", + "color": "#C4162A" + } + ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, grpc_method, grpc_code) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"unary\", grpc_method=\"RemoteWrite\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", + "fill": 10, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=\"$job\", grpc_type=\"unary\", grpc_method=\"RemoteWrite\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"unary\", grpc_method=\"RemoteWrite\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", + "fill": 1, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\", grpc_method=\"RemoteWrite\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\", grpc_method=\"RemoteWrite\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\", grpc_method=\"RemoteWrite\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "WRITE - gRPC (Unary)", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Unary gRPC requests from queriers.", + "fill": 10, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/Aborted/", + "color": "#EAB839" + }, + { + "alias": "/AlreadyExists/", + "color": "#37872D" + }, + { + "alias": "/FailedPrecondition/", + "color": "#E0B400" + }, + { + "alias": "/Unimplemented/", + "color": "#E0B400" + }, + { + "alias": "/InvalidArgument/", + "color": "#1F60C4" + }, + { + "alias": "/NotFound/", + "color": "#1F60C4" + }, + { + "alias": "/PermissionDenied/", + "color": "#1F60C4" + }, + { + "alias": "/Unauthenticated/", + "color": "#1F60C4" + }, + { + "alias": "/Canceled/", + "color": "#C4162A" + }, + { + "alias": "/DataLoss/", + "color": "#C4162A" + }, + { + "alias": "/DeadlineExceeded/", + "color": "#C4162A" + }, + { + "alias": "/Internal/", + "color": "#C4162A" + }, + { + "alias": "/OutOfRange/", + "color": "#C4162A" + }, + { + "alias": "/ResourceExhausted/", + "color": "#C4162A" + }, + { + "alias": "/Unavailable/", + "color": "#C4162A" + }, + { + "alias": "/Unknown/", + "color": "#C4162A" + }, + { + "alias": "/OK/", + "color": "#37872D" + }, + { + "alias": "error", + "color": "#C4162A" + } + ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, grpc_method, grpc_code) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", + "fill": 10, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", + "fill": 1, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "READ - gRPC (Unary)", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Streamed gRPC requests from queriers.", + "fill": 10, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/Aborted/", + "color": "#EAB839" + }, + { + "alias": "/AlreadyExists/", + "color": "#37872D" + }, + { + "alias": "/FailedPrecondition/", + "color": "#E0B400" + }, + { + "alias": "/Unimplemented/", + "color": "#E0B400" + }, + { + "alias": "/InvalidArgument/", + "color": "#1F60C4" + }, + { + "alias": "/NotFound/", + "color": "#1F60C4" + }, + { + "alias": "/PermissionDenied/", + "color": "#1F60C4" + }, + { + "alias": "/Unauthenticated/", + "color": "#1F60C4" + }, + { + "alias": "/Canceled/", + "color": "#C4162A" + }, + { + "alias": "/DataLoss/", + "color": "#C4162A" + }, + { + "alias": "/DeadlineExceeded/", + "color": "#C4162A" + }, + { + "alias": "/Internal/", + "color": "#C4162A" + }, + { + "alias": "/OutOfRange/", + "color": "#C4162A" + }, + { + "alias": "/ResourceExhausted/", + "color": "#C4162A" + }, + { + "alias": "/Unavailable/", + "color": "#C4162A" + }, + { + "alias": "/Unknown/", + "color": "#C4162A" + }, + { + "alias": "/OK/", + "color": "#37872D" + }, + { + "alias": "error", + "color": "#C4162A" + } + ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, grpc_method, grpc_code) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", + "fill": 10, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", + "fill": 1, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "READ - gRPC (Stream)", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows the relative time of last successful upload to the object-store bucket.", + "fill": 1, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "Uploaded Ago", + "colorMode": null, + "colors": [ ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value", + "thresholds": [ ], + "type": "number", + "unit": "s" + }, + { + "alias": "", + "colorMode": null, + "colors": [ ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "time() - max by (job, bucket) (thanos_objstore_bucket_last_successful_upload_time{job=\"$job\"})", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Successful Upload", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Last Updated", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 18, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_memstats_alloc_bytes{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc all {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_heap_alloc_bytes{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc heap {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "rate(go_memstats_alloc_bytes_total{job=\"$job\"}[30s])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc rate all {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "rate(go_memstats_heap_alloc_bytes{job=\"$job\"}[30s])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc rate heap {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_stack_inuse_bytes{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inuse heap {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_heap_inuse_bytes{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inuse stack {{instance}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Used", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 19, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_goroutines{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Goroutines", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 20, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_gc_duration_seconds{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{quantile}} {{instance}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "GC Time Quantiles", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Resources", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "thanos-mixin", + "portefaix-v0.6.0" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "auto": true, + "auto_count": 300, + "auto_min": "10s", + "current": { + "text": "5m", + "value": "5m" + }, + "hide": 0, + "label": "interval", + "name": "interval", + "query": "5m,10m,30m,1h,6h,12h", + "refresh": 2, + "type": "interval" + }, + { + "allValue": null, + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": false, + "name": "job", + "options": [ ], + "query": "label_values(up{job=~\".*thanos-receive.*\"}, job)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "UTC", + "title": "Thanos / Receive", + "uid": "916a852b00ccc5ed81056644718fa4fb", + "version": 0 +} diff --git a/charts/thanos-mixin/dashboards/rule.json b/charts/thanos-mixin/dashboards/rule.json new file mode 100644 index 00000000..3011de6c --- /dev/null +++ b/charts/thanos-mixin/dashboards/rule.json @@ -0,0 +1,1871 @@ +{ + "annotations": { + "list": [ ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, strategy) (rate(prometheus_rule_evaluations_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ strategy }}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rule Group Evaluations", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, strategy) (increase(prometheus_rule_group_iterations_missed_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ strategy }}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rule Group Evaluations Missed", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "(\n max by(job, rule_group) (prometheus_rule_group_last_duration_seconds{job=\"$job\"})\n >\n sum by(job, rule_group) (prometheus_rule_group_interval_seconds{job=\"$job\"})\n)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ rule_group }}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rule Group Evlauations Too Slow", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Rule Group Evaluations", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of dropped alerts.", + "fill": 1, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, alertmanager) (rate(thanos_alert_sender_alerts_dropped_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{alertmanager}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Dropped Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of alerts that successfully sent to alert manager.", + "fill": 10, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, alertmanager) (rate(thanos_alert_sender_alerts_sent_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{alertmanager}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Sent Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of sent alerts.", + "fill": 10, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(thanos_alert_sender_errors_total{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_alert_sender_alerts_sent_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Sent Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to send alerts to alert manager.", + "fill": 1, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(thanos_alert_sender_latency_seconds_bucket{job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(thanos_alert_sender_latency_seconds_bucket{job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(thanos_alert_sender_latency_seconds_bucket{job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Sent Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Alert Sent", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of queued alerts.", + "fill": 1, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(thanos_alert_queue_alerts_dropped_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Push Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of dropped alerts compared to the total number of queued alerts.", + "fill": 10, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(thanos_alert_queue_alerts_dropped_total{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_alert_queue_alerts_pushed_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Drop Ratio", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Alert Queue", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Unary gRPC requests.", + "fill": 10, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/Aborted/", + "color": "#EAB839" + }, + { + "alias": "/AlreadyExists/", + "color": "#37872D" + }, + { + "alias": "/FailedPrecondition/", + "color": "#E0B400" + }, + { + "alias": "/Unimplemented/", + "color": "#E0B400" + }, + { + "alias": "/InvalidArgument/", + "color": "#1F60C4" + }, + { + "alias": "/NotFound/", + "color": "#1F60C4" + }, + { + "alias": "/PermissionDenied/", + "color": "#1F60C4" + }, + { + "alias": "/Unauthenticated/", + "color": "#1F60C4" + }, + { + "alias": "/Canceled/", + "color": "#C4162A" + }, + { + "alias": "/DataLoss/", + "color": "#C4162A" + }, + { + "alias": "/DeadlineExceeded/", + "color": "#C4162A" + }, + { + "alias": "/Internal/", + "color": "#C4162A" + }, + { + "alias": "/OutOfRange/", + "color": "#C4162A" + }, + { + "alias": "/ResourceExhausted/", + "color": "#C4162A" + }, + { + "alias": "/Unavailable/", + "color": "#C4162A" + }, + { + "alias": "/Unknown/", + "color": "#C4162A" + }, + { + "alias": "/OK/", + "color": "#37872D" + }, + { + "alias": "error", + "color": "#C4162A" + } + ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, grpc_method, grpc_code) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"unary\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests.", + "fill": 10, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=\"$job\", grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"unary\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests, in quantiles.", + "fill": 1, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "gRPC (Unary)", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Streamed gRPC requests.", + "fill": 10, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/Aborted/", + "color": "#EAB839" + }, + { + "alias": "/AlreadyExists/", + "color": "#37872D" + }, + { + "alias": "/FailedPrecondition/", + "color": "#E0B400" + }, + { + "alias": "/Unimplemented/", + "color": "#E0B400" + }, + { + "alias": "/InvalidArgument/", + "color": "#1F60C4" + }, + { + "alias": "/NotFound/", + "color": "#1F60C4" + }, + { + "alias": "/PermissionDenied/", + "color": "#1F60C4" + }, + { + "alias": "/Unauthenticated/", + "color": "#1F60C4" + }, + { + "alias": "/Canceled/", + "color": "#C4162A" + }, + { + "alias": "/DataLoss/", + "color": "#C4162A" + }, + { + "alias": "/DeadlineExceeded/", + "color": "#C4162A" + }, + { + "alias": "/Internal/", + "color": "#C4162A" + }, + { + "alias": "/OutOfRange/", + "color": "#C4162A" + }, + { + "alias": "/ResourceExhausted/", + "color": "#C4162A" + }, + { + "alias": "/Unavailable/", + "color": "#C4162A" + }, + { + "alias": "/Unknown/", + "color": "#C4162A" + }, + { + "alias": "/OK/", + "color": "#37872D" + }, + { + "alias": "error", + "color": "#C4162A" + } + ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, grpc_method, grpc_code) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests.", + "fill": 10, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests, in quantiles", + "fill": 1, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "gRPC (Stream)", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_memstats_alloc_bytes{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc all {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_heap_alloc_bytes{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc heap {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "rate(go_memstats_alloc_bytes_total{job=\"$job\"}[30s])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc rate all {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "rate(go_memstats_heap_alloc_bytes{job=\"$job\"}[30s])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc rate heap {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_stack_inuse_bytes{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inuse heap {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_heap_inuse_bytes{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inuse stack {{instance}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Used", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_goroutines{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Goroutines", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 18, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_gc_duration_seconds{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{quantile}} {{instance}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "GC Time Quantiles", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Resources", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "thanos-mixin", + "portefaix-v0.6.0" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "auto": true, + "auto_count": 300, + "auto_min": "10s", + "current": { + "text": "5m", + "value": "5m" + }, + "hide": 0, + "label": "interval", + "name": "interval", + "query": "5m,10m,30m,1h,6h,12h", + "refresh": 2, + "type": "interval" + }, + { + "allValue": null, + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": false, + "name": "job", + "options": [ ], + "query": "label_values(up{job=~\".*thanos-rule.*\"}, job)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "UTC", + "title": "Thanos / Rule", + "uid": "35da848f5f92b2dc612e0c3a0577b8a1", + "version": 0 +} diff --git a/charts/thanos-mixin/dashboards/sidecar.json b/charts/thanos-mixin/dashboards/sidecar.json new file mode 100644 index 00000000..ae3fdf5e --- /dev/null +++ b/charts/thanos-mixin/dashboards/sidecar.json @@ -0,0 +1,1508 @@ +{ + "annotations": { + "list": [ ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Unary gRPC requests from queriers.", + "fill": 10, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/Aborted/", + "color": "#EAB839" + }, + { + "alias": "/AlreadyExists/", + "color": "#37872D" + }, + { + "alias": "/FailedPrecondition/", + "color": "#E0B400" + }, + { + "alias": "/Unimplemented/", + "color": "#E0B400" + }, + { + "alias": "/InvalidArgument/", + "color": "#1F60C4" + }, + { + "alias": "/NotFound/", + "color": "#1F60C4" + }, + { + "alias": "/PermissionDenied/", + "color": "#1F60C4" + }, + { + "alias": "/Unauthenticated/", + "color": "#1F60C4" + }, + { + "alias": "/Canceled/", + "color": "#C4162A" + }, + { + "alias": "/DataLoss/", + "color": "#C4162A" + }, + { + "alias": "/DeadlineExceeded/", + "color": "#C4162A" + }, + { + "alias": "/Internal/", + "color": "#C4162A" + }, + { + "alias": "/OutOfRange/", + "color": "#C4162A" + }, + { + "alias": "/ResourceExhausted/", + "color": "#C4162A" + }, + { + "alias": "/Unavailable/", + "color": "#C4162A" + }, + { + "alias": "/Unknown/", + "color": "#C4162A" + }, + { + "alias": "/OK/", + "color": "#37872D" + }, + { + "alias": "error", + "color": "#C4162A" + } + ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, grpc_method, grpc_code) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"unary\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", + "fill": 10, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=\"$job\", grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"unary\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "gRPC (Unary)", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Streamed gRPC requests from queriers.", + "fill": 10, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/Aborted/", + "color": "#EAB839" + }, + { + "alias": "/AlreadyExists/", + "color": "#37872D" + }, + { + "alias": "/FailedPrecondition/", + "color": "#E0B400" + }, + { + "alias": "/Unimplemented/", + "color": "#E0B400" + }, + { + "alias": "/InvalidArgument/", + "color": "#1F60C4" + }, + { + "alias": "/NotFound/", + "color": "#1F60C4" + }, + { + "alias": "/PermissionDenied/", + "color": "#1F60C4" + }, + { + "alias": "/Unauthenticated/", + "color": "#1F60C4" + }, + { + "alias": "/Canceled/", + "color": "#C4162A" + }, + { + "alias": "/DataLoss/", + "color": "#C4162A" + }, + { + "alias": "/DeadlineExceeded/", + "color": "#C4162A" + }, + { + "alias": "/Internal/", + "color": "#C4162A" + }, + { + "alias": "/OutOfRange/", + "color": "#C4162A" + }, + { + "alias": "/ResourceExhausted/", + "color": "#C4162A" + }, + { + "alias": "/Unavailable/", + "color": "#C4162A" + }, + { + "alias": "/Unknown/", + "color": "#C4162A" + }, + { + "alias": "/OK/", + "color": "#37872D" + }, + { + "alias": "error", + "color": "#C4162A" + } + ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, grpc_method, grpc_code) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", + "fill": 1, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "gRPC (Stream)", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows the relative time of last successful upload to the object-store bucket.", + "fill": 1, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "Uploaded Ago", + "colorMode": null, + "colors": [ ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value", + "thresholds": [ ], + "type": "number", + "unit": "s" + }, + { + "alias": "", + "colorMode": null, + "colors": [ ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "time() - max by (job, bucket) (thanos_objstore_bucket_last_successful_upload_time{job=\"$job\"})", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Successful Upload", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Last Updated", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, operation) (rate(thanos_objstore_bucket_operations_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{operation}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Bucket Operations", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_memstats_alloc_bytes{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc all {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_heap_alloc_bytes{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc heap {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "rate(go_memstats_alloc_bytes_total{job=\"$job\"}[30s])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc rate all {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "rate(go_memstats_heap_alloc_bytes{job=\"$job\"}[30s])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc rate heap {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_stack_inuse_bytes{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inuse heap {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_heap_inuse_bytes{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inuse stack {{instance}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Used", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_goroutines{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Goroutines", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_gc_duration_seconds{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{quantile}} {{instance}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "GC Time Quantiles", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Resources", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "thanos-mixin", + "portefaix-v0.6.0" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "auto": true, + "auto_count": 300, + "auto_min": "10s", + "current": { + "text": "5m", + "value": "5m" + }, + "hide": 0, + "label": "interval", + "name": "interval", + "query": "5m,10m,30m,1h,6h,12h", + "refresh": 2, + "type": "interval" + }, + { + "allValue": null, + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": false, + "name": "job", + "options": [ ], + "query": "label_values(up{job=~\".*thanos-sidecar.*\"}, job)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "UTC", + "title": "Thanos / Sidecar", + "uid": "b19644bfbf0ec1e108027cce268d99f7", + "version": 0 +} diff --git a/charts/thanos-mixin/dashboards/store.json b/charts/thanos-mixin/dashboards/store.json new file mode 100644 index 00000000..75787c10 --- /dev/null +++ b/charts/thanos-mixin/dashboards/store.json @@ -0,0 +1,2763 @@ +{ + "annotations": { + "list": [ ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Unary gRPC requests from queriers.", + "fill": 10, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/Aborted/", + "color": "#EAB839" + }, + { + "alias": "/AlreadyExists/", + "color": "#37872D" + }, + { + "alias": "/FailedPrecondition/", + "color": "#E0B400" + }, + { + "alias": "/Unimplemented/", + "color": "#E0B400" + }, + { + "alias": "/InvalidArgument/", + "color": "#1F60C4" + }, + { + "alias": "/NotFound/", + "color": "#1F60C4" + }, + { + "alias": "/PermissionDenied/", + "color": "#1F60C4" + }, + { + "alias": "/Unauthenticated/", + "color": "#1F60C4" + }, + { + "alias": "/Canceled/", + "color": "#C4162A" + }, + { + "alias": "/DataLoss/", + "color": "#C4162A" + }, + { + "alias": "/DeadlineExceeded/", + "color": "#C4162A" + }, + { + "alias": "/Internal/", + "color": "#C4162A" + }, + { + "alias": "/OutOfRange/", + "color": "#C4162A" + }, + { + "alias": "/ResourceExhausted/", + "color": "#C4162A" + }, + { + "alias": "/Unavailable/", + "color": "#C4162A" + }, + { + "alias": "/Unknown/", + "color": "#C4162A" + }, + { + "alias": "/OK/", + "color": "#37872D" + }, + { + "alias": "error", + "color": "#C4162A" + } + ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, grpc_method, grpc_code) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"unary\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", + "fill": 10, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=\"$job\", grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"unary\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "gRPC (Unary)", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Streamed gRPC requests from queriers.", + "fill": 10, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/Aborted/", + "color": "#EAB839" + }, + { + "alias": "/AlreadyExists/", + "color": "#37872D" + }, + { + "alias": "/FailedPrecondition/", + "color": "#E0B400" + }, + { + "alias": "/Unimplemented/", + "color": "#E0B400" + }, + { + "alias": "/InvalidArgument/", + "color": "#1F60C4" + }, + { + "alias": "/NotFound/", + "color": "#1F60C4" + }, + { + "alias": "/PermissionDenied/", + "color": "#1F60C4" + }, + { + "alias": "/Unauthenticated/", + "color": "#1F60C4" + }, + { + "alias": "/Canceled/", + "color": "#C4162A" + }, + { + "alias": "/DataLoss/", + "color": "#C4162A" + }, + { + "alias": "/DeadlineExceeded/", + "color": "#C4162A" + }, + { + "alias": "/Internal/", + "color": "#C4162A" + }, + { + "alias": "/OutOfRange/", + "color": "#C4162A" + }, + { + "alias": "/ResourceExhausted/", + "color": "#C4162A" + }, + { + "alias": "/Unavailable/", + "color": "#C4162A" + }, + { + "alias": "/Unknown/", + "color": "#C4162A" + }, + { + "alias": "/OK/", + "color": "#37872D" + }, + { + "alias": "error", + "color": "#C4162A" + } + ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, grpc_method, grpc_code) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", + "fill": 10, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", + "fill": 1, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "gRPC (Stream)", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of execution for operations against the bucket.", + "fill": 10, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, operation) (rate(thanos_objstore_bucket_operations_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{operation}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of executed operations against the bucket.", + "fill": 10, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, operation) (rate(thanos_objstore_bucket_operation_failures_total{job=\"$job\"}[$interval])) / sum by (job, operation) (rate(thanos_objstore_bucket_operations_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{operation}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to execute operations against the bucket, in quantiles.", + "fill": 1, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum by (job, operation, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}}", + "refId": "A", + "step": 10 + }, + { + "expr": "sum by (job, operation) (rate(thanos_objstore_bucket_operation_duration_seconds_sum{job=\"$job\"}[$interval])) * 1 / sum by (job, operation) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "refId": "B", + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum by (job, operation, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}}", + "refId": "C", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Bucket Operations", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of block loads from the bucket.", + "fill": 10, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(thanos_bucket_store_block_loads_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "block loads", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Block Load Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of block loads from the bucket.", + "fill": 10, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(thanos_bucket_store_block_load_failures_total{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_bucket_store_block_loads_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Block Load Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of block drops.", + "fill": 10, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, operation) (rate(thanos_bucket_store_block_drops_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "block drops {{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Block Drop Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of block drops.", + "fill": 10, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job) (rate(thanos_bucket_store_block_drop_failures_total{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_bucket_store_block_drops_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Block Drop Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Block Operations", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Show rate of cache requests.", + "fill": 10, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, item_type) (rate(thanos_store_index_cache_requests_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{item_type}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Requests", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of cache hits.", + "fill": 10, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, item_type) (rate(thanos_store_index_cache_hits_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{item_type}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Hits", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Show rate of added items to cache.", + "fill": 10, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, item_type) (rate(thanos_store_index_cache_items_added_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{item_type}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Added", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Show rate of evicted items from cache.", + "fill": 10, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, item_type) (rate(thanos_store_index_cache_items_evicted_total{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{item_type}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Evicted", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Cache Operations", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows size of chunks that have sent to the bucket.", + "fill": 1, + "id": 18, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_sent_chunk_size_bytes_bucket{job=\"$job\"}[$interval])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum by (job) (rate(thanos_bucket_store_sent_chunk_size_bytes_sum{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_bucket_store_sent_chunk_size_bytes_count{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean", + "legendLink": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_sent_chunk_size_bytes_bucket{job=\"$job\"}[$interval])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Chunk Size", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Store Sent", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 19, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "thanos_bucket_store_series_blocks_queried{job=\"$job\", quantile=\"0.99\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum by (job) (rate(thanos_bucket_store_series_blocks_queried_sum{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_bucket_store_series_blocks_queried_count{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "thanos_bucket_store_series_blocks_queried{job=\"$job\", quantile=\"0.50\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Block queried", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Show the size of data fetched", + "fill": 1, + "id": 20, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "thanos_bucket_store_series_data_fetched{job=\"$job\", quantile=\"0.99\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum by (job) (rate(thanos_bucket_store_series_data_fetched_sum{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_bucket_store_series_data_fetched_count{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "thanos_bucket_store_series_data_fetched{job=\"$job\", quantile=\"0.50\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Data Fetched", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 21, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "thanos_bucket_store_series_result_series{job=\"$job\",quantile=\"0.99\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum by (job) (rate(thanos_bucket_store_series_result_series_sum{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_bucket_store_series_result_series_count{job=\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "thanos_bucket_store_series_result_series{job=\"$job\",quantile=\"0.50\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Result series", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Series Operations", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to get all series.", + "fill": 1, + "id": 22, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(thanos_bucket_store_series_get_all_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(thanos_bucket_store_series_get_all_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_get_all_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Get All", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to merge series.", + "fill": 1, + "id": 23, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(thanos_bucket_store_series_merge_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(thanos_bucket_store_series_merge_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_merge_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Merge", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken for a series to wait at the gate.", + "fill": 1, + "id": 24, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Gate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Series Operation Durations", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 25, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_memstats_alloc_bytes{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc all {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_heap_alloc_bytes{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc heap {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "rate(go_memstats_alloc_bytes_total{job=\"$job\"}[30s])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc rate all {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "rate(go_memstats_heap_alloc_bytes{job=\"$job\"}[30s])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc rate heap {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_stack_inuse_bytes{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inuse heap {{instance}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_heap_inuse_bytes{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inuse stack {{instance}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Used", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 26, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_goroutines{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Goroutines", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 27, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_gc_duration_seconds{job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{quantile}} {{instance}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "GC Time Quantiles", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Resources", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "thanos-mixin", + "portefaix-v0.6.0" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "auto": true, + "auto_count": 300, + "auto_min": "10s", + "current": { + "text": "5m", + "value": "5m" + }, + "hide": 0, + "label": "interval", + "name": "interval", + "query": "5m,10m,30m,1h,6h,12h", + "refresh": 2, + "type": "interval" + }, + { + "allValue": null, + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": false, + "name": "job", + "options": [ ], + "query": "label_values(up{job=~\".*thanos-store.*\"}, job)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "UTC", + "title": "Thanos / Store", + "uid": "e832e8f26403d95fac0ea1c59837588b", + "version": 0 +} diff --git a/charts/thanos-mixin/templates/_helpers.tpl b/charts/thanos-mixin/templates/_helpers.tpl index 544fb2d8..eff51d1f 100644 --- a/charts/thanos-mixin/templates/_helpers.tpl +++ b/charts/thanos-mixin/templates/_helpers.tpl @@ -47,7 +47,7 @@ Common labels helm.sh/chart: {{ include "thanos-mixin.chart" . }} {{ include "thanos-mixin.selectorLabels" . }} {{- if .Chart.AppVersion }} -app.kubernetes.io/version: {{ .Chart.Version | quote }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} {{- end }} app.kubernetes.io/component: monitoring-mixin app.kubernetes.io/part-of: {{ include "thanos-mixin.name" . }} diff --git a/charts/thanos-mixin/templates/alerts.yaml b/charts/thanos-mixin/templates/alerts.yaml index 77310023..337afbe5 100644 --- a/charts/thanos-mixin/templates/alerts.yaml +++ b/charts/thanos-mixin/templates/alerts.yaml @@ -21,32 +21,32 @@ spec: rules: - alert: ThanosCompactMultipleRunning annotations: - description: No more than one Thanos Compact instance should be running at once. There are {{`{{`}} $value {{`}}`}} - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanoscompactmultiplerunning + description: No more than one Thanos Compact instance should be running at once. There are {{`{{`}}$value{{`}}`}} instances running. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactmultiplerunning summary: Thanos Compact has multiple instances running. - expr: sum(up{job=~"thanos-compact.*"}) > 1 + expr: sum by (job) (up{job=~".*thanos-compact.*"}) > 1 for: 5m labels: severity: warning - alert: ThanosCompactHalted annotations: description: Thanos Compact {{`{{`}}$labels.job{{`}}`}} has failed to run and now is halted. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanoscompacthalted + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthalted summary: Thanos Compact has failed to run ans is now halted. - expr: thanos_compact_halted{job=~"thanos-compact.*"} == 1 + expr: thanos_compact_halted{job=~".*thanos-compact.*"} == 1 for: 5m labels: severity: warning - alert: ThanosCompactHighCompactionFailures annotations: - description: Thanos Compact {{`{{`}}$labels.job{{`}}`}} is failing to execute {{`{{`}} $value | humanize {{`}}`}}% of compactions. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanoscompacthighcompactionfailures + description: Thanos Compact {{`{{`}}$labels.job{{`}}`}} is failing to execute {{`{{`}}$value | humanize{{`}}`}}% of compactions. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthighcompactionfailures summary: Thanos Compact is failing to execute compactions. expr: | ( - sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~"thanos-compact.*"}[5m])) + sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m])) / - sum by (job) (rate(thanos_compact_group_compactions_total{job=~"thanos-compact.*"}[5m])) + sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5 ) for: 15m @@ -54,14 +54,14 @@ spec: severity: warning - alert: ThanosCompactBucketHighOperationFailures annotations: - description: Thanos Compact {{`{{`}}$labels.job{{`}}`}} Bucket is failing to execute {{`{{`}} $value | humanize {{`}}`}}% of operations. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanoscompactbuckethighoperationfailures + description: Thanos Compact {{`{{`}}$labels.job{{`}}`}} Bucket is failing to execute {{`{{`}}$value | humanize{{`}}`}}% of operations. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactbuckethighoperationfailures summary: Thanos Compact Bucket is having a high number of operation failures. expr: | ( - sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-compact.*"}[5m])) + sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-compact.*"}[5m])) / - sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~"thanos-compact.*"}[5m])) + sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5 ) for: 15m @@ -70,51 +70,51 @@ spec: - alert: ThanosCompactHasNotRun annotations: description: Thanos Compact {{`{{`}}$labels.job{{`}}`}} has not uploaded anything for 24 hours. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanoscompacthasnotrun + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthasnotrun summary: Thanos Compact has not uploaded anything for last 24 hours. - expr: (time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h]))) / 60 / 60 > 24 + expr: (time() - max by (job) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~".*thanos-compact.*"}[24h]))) / 60 / 60 > 24 labels: severity: warning - name: thanos-query rules: - alert: ThanosQueryHttpRequestQueryErrorRateHigh annotations: - description: Thanos Query {{`{{`}}$labels.job{{`}}`}} is failing to handle {{`{{`}} $value | humanize {{`}}`}}% of "query" requests. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosqueryhttprequestqueryerrorratehigh + description: Thanos Query {{`{{`}}$labels.job{{`}}`}} is failing to handle {{`{{`}}$value | humanize{{`}}`}}% of "query" requests. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhttprequestqueryerrorratehigh summary: Thanos Query is failing to handle requests. expr: | ( - sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query"}[5m])) + sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query"}[5m])) / - sum(rate(http_requests_total{job=~"thanos-query.*", handler="query"}[5m])) + sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m])) ) * 100 > 5 for: 5m labels: severity: critical - alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh annotations: - description: Thanos Query {{`{{`}}$labels.job{{`}}`}} is failing to handle {{`{{`}} $value | humanize {{`}}`}}% of "query_range" requests. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosqueryhttprequestqueryrangeerrorratehigh + description: Thanos Query {{`{{`}}$labels.job{{`}}`}} is failing to handle {{`{{`}}$value | humanize{{`}}`}}% of "query_range" requests. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhttprequestqueryrangeerrorratehigh summary: Thanos Query is failing to handle requests. expr: | ( - sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query_range"}[5m])) + sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query_range"}[5m])) / - sum(rate(http_requests_total{job=~"thanos-query.*", handler="query_range"}[5m])) + sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m])) ) * 100 > 5 for: 5m labels: severity: critical - alert: ThanosQueryGrpcServerErrorRate annotations: - description: Thanos Query {{`{{`}}$labels.job{{`}}`}} is failing to handle {{`{{`}} $value | humanize {{`}}`}}% of requests. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosquerygrpcservererrorrate + description: Thanos Query {{`{{`}}$labels.job{{`}}`}} is failing to handle {{`{{`}}$value | humanize{{`}}`}}% of requests. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosquerygrpcservererrorrate summary: Thanos Query is failing to handle requests. expr: | ( - sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*"}[5m])) + sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*"}[5m])) / - sum by (job) (rate(grpc_server_started_total{job=~"thanos-query.*"}[5m])) + sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m])) * 100 > 5 ) for: 5m @@ -122,56 +122,56 @@ spec: severity: warning - alert: ThanosQueryGrpcClientErrorRate annotations: - description: Thanos Query {{`{{`}}$labels.job{{`}}`}} is failing to send {{`{{`}} $value | humanize {{`}}`}}% of requests. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosquerygrpcclienterrorrate + description: Thanos Query {{`{{`}}$labels.job{{`}}`}} is failing to send {{`{{`}}$value | humanize{{`}}`}}% of requests. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosquerygrpcclienterrorrate summary: Thanos Query is failing to send requests. expr: | ( - sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~"thanos-query.*"}[5m])) + sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m])) / - sum by (job) (rate(grpc_client_started_total{job=~"thanos-query.*"}[5m])) + sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m])) ) * 100 > 5 for: 5m labels: severity: warning - alert: ThanosQueryHighDNSFailures annotations: - description: Thanos Query {{`{{`}}$labels.job{{`}}`}} have {{`{{`}} $value | humanize {{`}}`}}% of failing DNS queries for store endpoints. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosqueryhighdnsfailures + description: Thanos Query {{`{{`}}$labels.job{{`}}`}} have {{`{{`}}$value | humanize{{`}}`}}% of failing DNS queries for store endpoints. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhighdnsfailures summary: Thanos Query is having high number of DNS failures. expr: | ( - sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m])) + sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m])) / - sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m])) + sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m])) ) * 100 > 1 for: 15m labels: severity: warning - alert: ThanosQueryInstantLatencyHigh annotations: - description: Thanos Query {{`{{`}}$labels.job{{`}}`}} has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for instant queries. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosqueryinstantlatencyhigh + description: Thanos Query {{`{{`}}$labels.job{{`}}`}} has a 99th percentile latency of {{`{{`}}$value{{`}}`}} seconds for instant queries. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryinstantlatencyhigh summary: Thanos Query has high latency for queries. expr: | ( - histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))) > 40 + histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and - sum by (job) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m])) > 0 + sum by (job) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m])) > 0 ) for: 10m labels: severity: critical - alert: ThanosQueryRangeLatencyHigh annotations: - description: Thanos Query {{`{{`}}$labels.job{{`}}`}} has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for range queries. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosqueryrangelatencyhigh + description: Thanos Query {{`{{`}}$labels.job{{`}}`}} has a 99th percentile latency of {{`{{`}}$value{{`}}`}} seconds for range queries. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryrangelatencyhigh summary: Thanos Query has high latency for queries. expr: | ( - histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m]))) > 90 + histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90 and - sum by (job) (rate(http_request_duration_seconds_count{job=~"thanos-query.*", handler="query_range"}[5m])) > 0 + sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0 ) for: 10m labels: @@ -180,14 +180,14 @@ spec: rules: - alert: ThanosReceiveHttpRequestErrorRateHigh annotations: - description: Thanos Receive {{`{{`}}$labels.job{{`}}`}} is failing to handle {{`{{`}} $value | humanize {{`}}`}}% of requests. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosreceivehttprequesterrorratehigh + description: Thanos Receive {{`{{`}}$labels.job{{`}}`}} is failing to handle {{`{{`}}$value | humanize{{`}}`}}% of requests. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehttprequesterrorratehigh summary: Thanos Receive is failing to handle requests. expr: | ( - sum(rate(http_requests_total{code=~"5..", job=~"thanos-receive.*", handler="receive"}[5m])) + sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-receive.*", handler="receive"}[5m])) / - sum(rate(http_requests_total{job=~"thanos-receive.*", handler="receive"}[5m])) + sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m])) ) * 100 > 5 for: 5m labels: @@ -195,36 +195,36 @@ spec: - alert: ThanosReceiveHttpRequestLatencyHigh annotations: description: Thanos Receive {{`{{`}}$labels.job{{`}}`}} has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for requests. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosreceivehttprequestlatencyhigh + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehttprequestlatencyhigh summary: Thanos Receive has high HTTP requests latency. expr: | ( - histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-receive.*", handler="receive"}[5m]))) > 10 + histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*", handler="receive"}[5m]))) > 10 and - sum by (job) (rate(http_request_duration_seconds_count{job=~"thanos-receive.*", handler="receive"}[5m])) > 0 + sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0 ) for: 10m labels: severity: critical - alert: ThanosReceiveHighReplicationFailures annotations: - description: Thanos Receive {{`{{`}}$labels.job{{`}}`}} is failing to replicate {{`{{`}} $value | humanize {{`}}`}}% of requests. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosreceivehighreplicationfailures + description: Thanos Receive {{`{{`}}$labels.job{{`}}`}} is failing to replicate {{`{{`}}$value | humanize{{`}}`}}% of requests. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighreplicationfailures summary: Thanos Receive is having high number of replication failures. expr: | thanos_receive_replication_factor > 1 and ( ( - sum by (job) (rate(thanos_receive_replications_total{result="error", job=~"thanos-receive.*"}[5m])) + sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m])) / - sum by (job) (rate(thanos_receive_replications_total{job=~"thanos-receive.*"}[5m])) + sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m])) ) > ( - max by (job) (floor((thanos_receive_replication_factor{job=~"thanos-receive.*"}+1) / 2)) + max by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1) / 2)) / - max by (job) (thanos_receive_hashring_nodes{job=~"thanos-receive.*"}) + max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"}) ) ) * 100 for: 5m @@ -232,28 +232,28 @@ spec: severity: warning - alert: ThanosReceiveHighForwardRequestFailures annotations: - description: Thanos Receive {{`{{`}}$labels.job{{`}}`}} is failing to forward {{`{{`}} $value | humanize {{`}}`}}% of requests. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosreceivehighforwardrequestfailures + description: Thanos Receive {{`{{`}}$labels.job{{`}}`}} is failing to forward {{`{{`}}$value | humanize{{`}}`}}% of requests. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighforwardrequestfailures summary: Thanos Receive is failing to forward requests. expr: | ( - sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m])) + sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m])) / - sum by (job) (rate(thanos_receive_forward_requests_total{job=~"thanos-receive.*"}[5m])) + sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m])) ) * 100 > 20 for: 5m labels: - severity: warning + severity: info - alert: ThanosReceiveHighHashringFileRefreshFailures annotations: - description: Thanos Receive {{`{{`}}$labels.job{{`}}`}} is failing to refresh hashring file, {{`{{`}} $value | humanize {{`}}`}} of attempts failed. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosreceivehighhashringfilerefreshfailures + description: Thanos Receive {{`{{`}}$labels.job{{`}}`}} is failing to refresh hashring file, {{`{{`}}$value | humanize{{`}}`}} of attempts failed. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighhashringfilerefreshfailures summary: Thanos Receive is failing to refresh hasring file. expr: | ( - sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m])) + sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m])) / - sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~"thanos-receive.*"}[5m])) + sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0 ) for: 15m @@ -262,67 +262,81 @@ spec: - alert: ThanosReceiveConfigReloadFailure annotations: description: Thanos Receive {{`{{`}}$labels.job{{`}}`}} has not been able to reload hashring configurations. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosreceiveconfigreloadfailure + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceiveconfigreloadfailure summary: Thanos Receive has not been able to reload configuration. - expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"}) by (job) != 1 + expr: avg by (job) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"}) != 1 for: 5m labels: severity: warning - alert: ThanosReceiveNoUpload annotations: - description: Thanos Receive {{`{{`}} $labels.instance {{`}}`}} of {{`{{`}}$labels.job{{`}}`}} has not uploaded latest data to object storage. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosreceivenoupload + description: Thanos Receive {{`{{`}}$labels.instance{{`}}`}} has not uploaded latest data to object storage. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivenoupload summary: Thanos Receive has not uploaded latest data to object storage. expr: | - (up{job=~"thanos-receive.*"} - 1) - + on (instance) # filters to only alert on current instance last 3h - (sum by (instance) (increase(thanos_shipper_uploads_total{job=~"thanos-receive.*"}[3h])) == 0) + (up{job=~".*thanos-receive.*"} - 1) + + on (job, instance) # filters to only alert on current instance last 3h + (sum by (job, instance) (increase(thanos_shipper_uploads_total{job=~".*thanos-receive.*"}[3h])) == 0) for: 3h labels: severity: critical + - alert: ThanosReceiveTrafficBelowThreshold + annotations: + description: At Thanos Receive {{`{{`}}$labels.job{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} , the average 1-hr avg. metrics ingestion rate is {{`{{`}}$value | humanize{{`}}`}}% of 12-hr avg. ingestion rate. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivetrafficbelowthreshold + summary: Thanos Receive is experiencing low avg. 1-hr ingestion rate relative to avg. 12-hr ingestion rate. + expr: | + ( + avg_over_time(rate(http_requests_total{job=~".*thanos-receive.*", code=~"2..", handler="receive"}[5m])[1h:5m]) + / + avg_over_time(rate(http_requests_total{job=~".*thanos-receive.*", code=~"2..", handler="receive"}[5m])[12h:5m]) + ) * 100 < 50 + for: 1h + labels: + severity: warning - name: thanos-sidecar rules: - alert: ThanosSidecarPrometheusDown annotations: - description: Thanos Sidecar {{`{{`}}$labels.job{{`}}`}} {{`{{`}}$labels.pod{{`}}`}} cannot connect to Prometheus. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanossidecarprometheusdown + description: Thanos Sidecar {{`{{`}}$labels.instance{{`}}`}} cannot connect to Prometheus. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarprometheusdown summary: Thanos Sidecar cannot connect to Prometheus expr: | - sum by (job, pod) (thanos_sidecar_prometheus_up{job=~"thanos-sidecar.*"} == 0) + thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0 for: 5m labels: severity: critical - alert: ThanosSidecarBucketOperationsFailed annotations: - description: Thanos Sidecar {{`{{`}}$labels.job{{`}}`}} {{`{{`}}$labels.pod{{`}}`}} bucket operations are failing - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanossidecarbucketoperationsfailed + description: Thanos Sidecar {{`{{`}}$labels.instance{{`}}`}} bucket operations are failing + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarbucketoperationsfailed summary: Thanos Sidecar bucket operations are failing expr: | - rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-sidecar.*"}[5m]) > 0 + sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-sidecar.*"}[5m])) > 0 for: 5m labels: severity: critical - alert: ThanosSidecarUnhealthy annotations: - description: Thanos Sidecar {{`{{`}}$labels.job{{`}}`}} {{`{{`}}$labels.pod{{`}}`}} is unhealthy for {{`{{`}} $value {{`}}`}} seconds. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanossidecarunhealthy + description: Thanos Sidecar {{`{{`}}$labels.instance{{`}}`}} is unhealthy for more than {{`{{`}}$value{{`}}`}} seconds. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy summary: Thanos Sidecar is unhealthy. expr: | - time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 600 + time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"})) >= 240 labels: severity: critical - name: thanos-store rules: - alert: ThanosStoreGrpcErrorRate annotations: - description: Thanos Store {{`{{`}}$labels.job{{`}}`}} is failing to handle {{`{{`}} $value | humanize {{`}}`}}% of requests. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosstoregrpcerrorrate + description: Thanos Store {{`{{`}}$labels.job{{`}}`}} is failing to handle {{`{{`}}$value | humanize{{`}}`}}% of requests. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoregrpcerrorrate summary: Thanos Store is failing to handle qrpcd requests. expr: | ( - sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*"}[5m])) + sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*"}[5m])) / - sum by (job) (rate(grpc_server_started_total{job=~"thanos-store.*"}[5m])) + sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m])) * 100 > 5 ) for: 5m @@ -330,28 +344,28 @@ spec: severity: warning - alert: ThanosStoreSeriesGateLatencyHigh annotations: - description: Thanos Store {{`{{`}}$labels.job{{`}}`}} has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for store series gate requests. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosstoreseriesgatelatencyhigh + description: Thanos Store {{`{{`}}$labels.job{{`}}`}} has a 99th percentile latency of {{`{{`}}$value{{`}}`}} seconds for store series gate requests. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreseriesgatelatencyhigh summary: Thanos Store has high latency for store series gate requests. expr: | ( - histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2 + histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and - sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~"thanos-store.*"}[5m])) > 0 + sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0 ) for: 10m labels: severity: warning - alert: ThanosStoreBucketHighOperationFailures annotations: - description: Thanos Store {{`{{`}}$labels.job{{`}}`}} Bucket is failing to execute {{`{{`}} $value | humanize {{`}}`}}% of operations. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosstorebuckethighoperationfailures + description: Thanos Store {{`{{`}}$labels.job{{`}}`}} Bucket is failing to execute {{`{{`}}$value | humanize{{`}}`}}% of operations. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstorebuckethighoperationfailures summary: Thanos Store Bucket is failing to execute operations. expr: | ( - sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m])) + sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-store.*"}[5m])) / - sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~"thanos-store.*"}[5m])) + sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m])) * 100 > 5 ) for: 15m @@ -359,14 +373,14 @@ spec: severity: warning - alert: ThanosStoreObjstoreOperationLatencyHigh annotations: - description: Thanos Store {{`{{`}}$labels.job{{`}}`}} Bucket has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for the bucket operations. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosstoreobjstoreoperationlatencyhigh + description: Thanos Store {{`{{`}}$labels.job{{`}}`}} Bucket has a 99th percentile latency of {{`{{`}}$value{{`}}`}} seconds for the bucket operations. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreobjstoreoperationlatencyhigh summary: Thanos Store is having high latency for bucket operations. expr: | ( - histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2 + histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and - sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~"thanos-store.*"}[5m])) > 0 + sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0 ) for: 10m labels: @@ -375,34 +389,34 @@ spec: rules: - alert: ThanosRuleQueueIsDroppingAlerts annotations: - description: Thanos Rule {{`{{`}}$labels.job{{`}}`}} {{`{{`}}$labels.pod{{`}}`}} is failing to queue alerts. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosrulequeueisdroppingalerts + description: Thanos Rule {{`{{`}}$labels.instance{{`}}`}} is failing to queue alerts. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulequeueisdroppingalerts summary: Thanos Rule is failing to queue alerts. expr: | - sum by (job) (rate(thanos_alert_queue_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0 + sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0 for: 5m labels: severity: critical - alert: ThanosRuleSenderIsFailingAlerts annotations: - description: Thanos Rule {{`{{`}}$labels.job{{`}}`}} {{`{{`}}$labels.pod{{`}}`}} is failing to send alerts to alertmanager. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosrulesenderisfailingalerts + description: Thanos Rule {{`{{`}}$labels.instance{{`}}`}} is failing to send alerts to alertmanager. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulesenderisfailingalerts summary: Thanos Rule is failing to send alerts to alertmanager. expr: | - sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0 + sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0 for: 5m labels: severity: critical - alert: ThanosRuleHighRuleEvaluationFailures annotations: - description: Thanos Rule {{`{{`}}$labels.job{{`}}`}} {{`{{`}}$labels.pod{{`}}`}} is failing to evaluate rules. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosrulehighruleevaluationfailures + description: Thanos Rule {{`{{`}}$labels.instance{{`}}`}} is failing to evaluate rules. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulehighruleevaluationfailures summary: Thanos Rule is failing to evaluate rules. expr: | ( - sum by (job) (rate(prometheus_rule_evaluation_failures_total{job=~"thanos-rule.*"}[5m])) + sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m])) / - sum by (job) (rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[5m])) + sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5 ) for: 5m @@ -410,38 +424,38 @@ spec: severity: critical - alert: ThanosRuleHighRuleEvaluationWarnings annotations: - description: Thanos Rule {{`{{`}}$labels.job{{`}}`}} {{`{{`}}$labels.pod{{`}}`}} has high number of evaluation warnings. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosrulehighruleevaluationwarnings + description: Thanos Rule {{`{{`}}$labels.instance{{`}}`}} has high number of evaluation warnings. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulehighruleevaluationwarnings summary: Thanos Rule has high number of evaluation warnings. expr: | - sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-rule.*"}[5m])) > 0 + sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0 for: 15m labels: severity: info - alert: ThanosRuleRuleEvaluationLatencyHigh annotations: - description: Thanos Rule {{`{{`}}$labels.job{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has higher evaluation latency than interval for {{`{{`}}$labels.rule_group{{`}}`}}. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosruleruleevaluationlatencyhigh + description: Thanos Rule {{`{{`}}$labels.instance{{`}}`}} has higher evaluation latency than interval for {{`{{`}}$labels.rule_group{{`}}`}}. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosruleruleevaluationlatencyhigh summary: Thanos Rule has high rule evaluation latency. expr: | ( - sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-rule.*"}) + sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"}) > - sum by (job, pod, rule_group) (prometheus_rule_group_interval_seconds{job=~"thanos-rule.*"}) + sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"}) ) for: 5m labels: severity: warning - alert: ThanosRuleGrpcErrorRate annotations: - description: Thanos Rule {{`{{`}}$labels.job{{`}}`}} is failing to handle {{`{{`}} $value | humanize {{`}}`}}% of requests. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosrulegrpcerrorrate + description: Thanos Rule {{`{{`}}$labels.job{{`}}`}} is failing to handle {{`{{`}}$value | humanize{{`}}`}}% of requests. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulegrpcerrorrate summary: Thanos Rule is failing to handle grpc requests. expr: | ( - sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-rule.*"}[5m])) + sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-rule.*"}[5m])) / - sum by (job) (rate(grpc_server_started_total{job=~"thanos-rule.*"}[5m])) + sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5 ) for: 5m @@ -450,22 +464,22 @@ spec: - alert: ThanosRuleConfigReloadFailure annotations: description: Thanos Rule {{`{{`}}$labels.job{{`}}`}} has not been able to reload its configuration. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosruleconfigreloadfailure + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosruleconfigreloadfailure summary: Thanos Rule has not been able to reload configuration. - expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-rule.*"}) by (job) != 1 + expr: avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) != 1 for: 5m labels: severity: info - alert: ThanosRuleQueryHighDNSFailures annotations: - description: Thanos Rule {{`{{`}}$labels.job{{`}}`}} has {{`{{`}} $value | humanize {{`}}`}}% of failing DNS queries for query endpoints. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosrulequeryhighdnsfailures + description: Thanos Rule {{`{{`}}$labels.job{{`}}`}} has {{`{{`}}$value | humanize{{`}}`}}% of failing DNS queries for query endpoints. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulequeryhighdnsfailures summary: Thanos Rule is having high number of DNS failures. expr: | ( - sum by (job) (rate(thanos_rule_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m])) + sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / - sum by (job) (rate(thanos_rule_query_apis_dns_lookups_total{job=~"thanos-rule.*"}[5m])) + sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1 ) for: 15m @@ -473,14 +487,14 @@ spec: severity: warning - alert: ThanosRuleAlertmanagerHighDNSFailures annotations: - description: Thanos Rule {{`{{`}}$labels.job{{`}}`}} has {{`{{`}} $value | humanize {{`}}`}}% of failing DNS queries for Alertmanager endpoints. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosrulealertmanagerhighdnsfailures + description: Thanos Rule {{`{{`}}$labels.instance{{`}}`}} has {{`{{`}}$value | humanize{{`}}`}}% of failing DNS queries for Alertmanager endpoints. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulealertmanagerhighdnsfailures summary: Thanos Rule is having high number of DNS failures. expr: | ( - sum by (job) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m])) + sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / - sum by (job) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~"thanos-rule.*"}[5m])) + sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1 ) for: 15m @@ -488,64 +502,54 @@ spec: severity: warning - alert: ThanosRuleNoEvaluationFor10Intervals annotations: - description: Thanos Rule {{`{{`}}$labels.job{{`}}`}} has {{`{{`}} $value | humanize {{`}}`}}% rule groups that did not evaluate for at least 10x of their expected interval. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosrulenoevaluationfor10intervals + description: Thanos Rule {{`{{`}}$labels.job{{`}}`}} has {{`{{`}}$value | humanize{{`}}`}}% rule groups that did not evaluate for at least 10x of their expected interval. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulenoevaluationfor10intervals summary: Thanos Rule has rule groups that did not evaluate for 10 intervals. expr: | - time() - max by (job, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~"thanos-rule.*"}) + time() - max by (job, instance, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*thanos-rule.*"}) > - 10 * max by (job, group) (prometheus_rule_group_interval_seconds{job=~"thanos-rule.*"}) + 10 * max by (job, instance, group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"}) for: 5m labels: severity: info - alert: ThanosNoRuleEvaluations annotations: - description: Thanos Rule {{`{{`}}$labels.job{{`}}`}} did not perform any rule evaluations in the past 2 minutes. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosnoruleevaluations + description: Thanos Rule {{`{{`}}$labels.instance{{`}}`}} did not perform any rule evaluations in the past 10 minutes. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosnoruleevaluations summary: Thanos Rule did not perform any rule evaluations. expr: | - sum(rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[2m])) <= 0 + sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0 and - sum(thanos_rule_loaded_rules{job=~"thanos-rule.*"}) > 0 - for: 3m + sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0 + for: 5m labels: severity: critical - name: thanos-bucket-replicate rules: - - alert: ThanosBucketReplicateIsDown - annotations: - description: Thanos Replicate has disappeared from Prometheus target discovery. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosbucketreplicateisdown - summary: Thanos Replicate has disappeared from Prometheus target discovery. - expr: | - absent(up{job=~"thanos-bucket-replicate.*"}) - for: 5m - labels: - severity: critical - alert: ThanosBucketReplicateErrorRate annotations: - description: Thanos Replicate failing to run, {{`{{`}} $value | humanize {{`}}`}}% of attempts failed. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosbucketreplicateerrorrate + description: Thanos Replicate is failing to run, {{`{{`}}$value | humanize{{`}}`}}% of attempts failed. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicateerrorrate summary: Thanose Replicate is failing to run. expr: | ( - sum(rate(thanos_replicate_replication_runs_total{result="error", job=~"thanos-bucket-replicate.*"}[5m])) - / on (namespace) group_left - sum(rate(thanos_replicate_replication_runs_total{job=~"thanos-bucket-replicate.*"}[5m])) + sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*thanos-bucket-replicate.*"}[5m])) + / on (job) group_left + sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m])) ) * 100 >= 10 for: 5m labels: severity: critical - alert: ThanosBucketReplicateRunLatency annotations: - description: Thanos Replicate {{`{{`}}$labels.job{{`}}`}} has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for the replicate operations. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosbucketreplicaterunlatency + description: Thanos Replicate {{`{{`}}$labels.job{{`}}`}} has a 99th percentile latency of {{`{{`}}$value{{`}}`}} seconds for the replicate operations. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicaterunlatency summary: Thanos Replicate has a high latency for replicate operations. expr: | ( - histogram_quantile(0.99, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 20 + histogram_quantile(0.99, sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20 and - sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m])) > 0 + sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m])) > 0 ) for: 5m labels: @@ -554,71 +558,71 @@ spec: rules: - alert: ThanosBucketReplicateIsDown annotations: - description: ThanosBucketReplicate has disappeared from Prometheus target discovery. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosbucketreplicateisdown - summary: thanos component has disappeared from Prometheus target discovery. + description: ThanosBucketReplicate has disappeared. Prometheus target for the component cannot be discovered. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicateisdown + summary: Thanos component has disappeared. expr: | - absent(up{job=~"thanos-bucket-replicate.*"} == 1) + absent(up{job=~".*thanos-bucket-replicate.*"} == 1) for: 5m labels: severity: critical - alert: ThanosCompactIsDown annotations: - description: ThanosCompact has disappeared from Prometheus target discovery. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanoscompactisdown - summary: thanos component has disappeared from Prometheus target discovery. + description: ThanosCompact has disappeared. Prometheus target for the component cannot be discovered. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactisdown + summary: Thanos component has disappeared. expr: | - absent(up{job=~"thanos-compact.*"} == 1) + absent(up{job=~".*thanos-compact.*"} == 1) for: 5m labels: severity: critical - alert: ThanosQueryIsDown annotations: - description: ThanosQuery has disappeared from Prometheus target discovery. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosqueryisdown - summary: thanos component has disappeared from Prometheus target discovery. + description: ThanosQuery has disappeared. Prometheus target for the component cannot be discovered. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryisdown + summary: Thanos component has disappeared. expr: | - absent(up{job=~"thanos-query.*"} == 1) + absent(up{job=~".*thanos-query.*"} == 1) for: 5m labels: severity: critical - alert: ThanosReceiveIsDown annotations: - description: ThanosReceive has disappeared from Prometheus target discovery. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosreceiveisdown - summary: thanos component has disappeared from Prometheus target discovery. + description: ThanosReceive has disappeared. Prometheus target for the component cannot be discovered. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceiveisdown + summary: Thanos component has disappeared. expr: | - absent(up{job=~"thanos-receive.*"} == 1) + absent(up{job=~".*thanos-receive.*"} == 1) for: 5m labels: severity: critical - alert: ThanosRuleIsDown annotations: - description: ThanosRule has disappeared from Prometheus target discovery. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosruleisdown - summary: thanos component has disappeared from Prometheus target discovery. + description: ThanosRule has disappeared. Prometheus target for the component cannot be discovered. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosruleisdown + summary: Thanos component has disappeared. expr: | - absent(up{job=~"thanos-rule.*"} == 1) + absent(up{job=~".*thanos-rule.*"} == 1) for: 5m labels: severity: critical - alert: ThanosSidecarIsDown annotations: - description: ThanosSidecar has disappeared from Prometheus target discovery. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanossidecarisdown - summary: thanos component has disappeared from Prometheus target discovery. + description: ThanosSidecar has disappeared. Prometheus target for the component cannot be discovered. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarisdown + summary: Thanos component has disappeared. expr: | - absent(up{job=~"thanos-sidecar.*"} == 1) + absent(up{job=~".*thanos-sidecar.*"} == 1) for: 5m labels: severity: critical - alert: ThanosStoreIsDown annotations: - description: ThanosStore has disappeared from Prometheus target discovery. - runbook_url: https://github.com/thanos-io/thanos/tree/master/mixin/runbook.md#alert-name-thanosstoreisdown - summary: thanos component has disappeared from Prometheus target discovery. + description: ThanosStore has disappeared. Prometheus target for the component cannot be discovered. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreisdown + summary: Thanos component has disappeared. expr: | - absent(up{job=~"thanos-store.*"} == 1) + absent(up{job=~".*thanos-store.*"} == 1) for: 5m labels: severity: critical diff --git a/charts/thanos-mixin/templates/configmap-bucket_replicate.yaml b/charts/thanos-mixin/templates/configmap-bucket_replicate.yaml new file mode 100644 index 00000000..02c7c65b --- /dev/null +++ b/charts/thanos-mixin/templates/configmap-bucket_replicate.yaml @@ -0,0 +1,21 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ printf "%s-%s" (include "thanos-mixin.fullname" .) "bucket-replicate" | trunc 63 | trimSuffix "-" }} + namespace: {{ include "thanos-mixin.namespace" . }} + annotations: +{{ include "thanos-mixin.annotations" . | indent 4 }} +{{- if .Values.additionalAnnotations }} +{{ toYaml .Values.additionalAnnotations | indent 4 }} +{{- end }} + labels: + app: {{ include "thanos-mixin.name" . }} +{{ include "thanos-mixin.labels" . | indent 4 }} +{{- if .Values.additionalLabels }} +{{ toYaml .Values.additionalLabels | indent 4 }} +{{- end }} + grafana_dashboard: {{ include "thanos-mixin.name" . }}-bucket-replicate +data: + bucket_replicate.json: |- +{{ .Files.Get "dashboards/bucket_replicate.json" | indent 4}} \ No newline at end of file diff --git a/charts/thanos-mixin/templates/configmap-compact.yaml b/charts/thanos-mixin/templates/configmap-compact.yaml new file mode 100644 index 00000000..2da1acfe --- /dev/null +++ b/charts/thanos-mixin/templates/configmap-compact.yaml @@ -0,0 +1,21 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ printf "%s-%s" (include "thanos-mixin.fullname" .) "compact" | trunc 63 | trimSuffix "-" }} + namespace: {{ include "thanos-mixin.namespace" . }} + annotations: +{{ include "thanos-mixin.annotations" . | indent 4 }} +{{- if .Values.additionalAnnotations }} +{{ toYaml .Values.additionalAnnotations | indent 4 }} +{{- end }} + labels: + app: {{ include "thanos-mixin.name" . }} +{{ include "thanos-mixin.labels" . | indent 4 }} +{{- if .Values.additionalLabels }} +{{ toYaml .Values.additionalLabels | indent 4 }} +{{- end }} + grafana_dashboard: {{ include "thanos-mixin.name" . }}-compact +data: + compact.json: |- +{{ .Files.Get "dashboards/compact.json" | indent 4}} \ No newline at end of file diff --git a/charts/thanos-mixin/templates/configmap-overview.yaml b/charts/thanos-mixin/templates/configmap-overview.yaml new file mode 100644 index 00000000..a314f9bd --- /dev/null +++ b/charts/thanos-mixin/templates/configmap-overview.yaml @@ -0,0 +1,21 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ printf "%s-%s" (include "thanos-mixin.fullname" .) "overview" | trunc 63 | trimSuffix "-" }} + namespace: {{ include "thanos-mixin.namespace" . }} + annotations: +{{ include "thanos-mixin.annotations" . | indent 4 }} +{{- if .Values.additionalAnnotations }} +{{ toYaml .Values.additionalAnnotations | indent 4 }} +{{- end }} + labels: + app: {{ include "thanos-mixin.name" . }} +{{ include "thanos-mixin.labels" . | indent 4 }} +{{- if .Values.additionalLabels }} +{{ toYaml .Values.additionalLabels | indent 4 }} +{{- end }} + grafana_dashboard: {{ include "thanos-mixin.name" . }}-overview +data: + overview.json: |- +{{ .Files.Get "dashboards/overview.json" | indent 4}} \ No newline at end of file diff --git a/charts/thanos-mixin/templates/configmap-query.yaml b/charts/thanos-mixin/templates/configmap-query.yaml new file mode 100644 index 00000000..a46ee5f5 --- /dev/null +++ b/charts/thanos-mixin/templates/configmap-query.yaml @@ -0,0 +1,21 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ printf "%s-%s" (include "thanos-mixin.fullname" .) "query" | trunc 63 | trimSuffix "-" }} + namespace: {{ include "thanos-mixin.namespace" . }} + annotations: +{{ include "thanos-mixin.annotations" . | indent 4 }} +{{- if .Values.additionalAnnotations }} +{{ toYaml .Values.additionalAnnotations | indent 4 }} +{{- end }} + labels: + app: {{ include "thanos-mixin.name" . }} +{{ include "thanos-mixin.labels" . | indent 4 }} +{{- if .Values.additionalLabels }} +{{ toYaml .Values.additionalLabels | indent 4 }} +{{- end }} + grafana_dashboard: {{ include "thanos-mixin.name" . }}-query +data: + query.json: |- +{{ .Files.Get "dashboards/query.json" | indent 4}} \ No newline at end of file diff --git a/charts/thanos-mixin/templates/configmap-receive.yaml b/charts/thanos-mixin/templates/configmap-receive.yaml new file mode 100644 index 00000000..34dbb5d1 --- /dev/null +++ b/charts/thanos-mixin/templates/configmap-receive.yaml @@ -0,0 +1,21 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ printf "%s-%s" (include "thanos-mixin.fullname" .) "receive" | trunc 63 | trimSuffix "-" }} + namespace: {{ include "thanos-mixin.namespace" . }} + annotations: +{{ include "thanos-mixin.annotations" . | indent 4 }} +{{- if .Values.additionalAnnotations }} +{{ toYaml .Values.additionalAnnotations | indent 4 }} +{{- end }} + labels: + app: {{ include "thanos-mixin.name" . }} +{{ include "thanos-mixin.labels" . | indent 4 }} +{{- if .Values.additionalLabels }} +{{ toYaml .Values.additionalLabels | indent 4 }} +{{- end }} + grafana_dashboard: {{ include "thanos-mixin.name" . }}-receive +data: + receive.json: |- +{{ .Files.Get "dashboards/receive.json" | indent 4}} \ No newline at end of file diff --git a/charts/thanos-mixin/templates/configmap-rule.yaml b/charts/thanos-mixin/templates/configmap-rule.yaml new file mode 100644 index 00000000..1176cc0e --- /dev/null +++ b/charts/thanos-mixin/templates/configmap-rule.yaml @@ -0,0 +1,21 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ printf "%s-%s" (include "thanos-mixin.fullname" .) "rule" | trunc 63 | trimSuffix "-" }} + namespace: {{ include "thanos-mixin.namespace" . }} + annotations: +{{ include "thanos-mixin.annotations" . | indent 4 }} +{{- if .Values.additionalAnnotations }} +{{ toYaml .Values.additionalAnnotations | indent 4 }} +{{- end }} + labels: + app: {{ include "thanos-mixin.name" . }} +{{ include "thanos-mixin.labels" . | indent 4 }} +{{- if .Values.additionalLabels }} +{{ toYaml .Values.additionalLabels | indent 4 }} +{{- end }} + grafana_dashboard: {{ include "thanos-mixin.name" . }}-rule +data: + rule.json: |- +{{ .Files.Get "dashboards/rule.json" | indent 4}} \ No newline at end of file diff --git a/charts/thanos-mixin/templates/configmap-sidecar.yaml b/charts/thanos-mixin/templates/configmap-sidecar.yaml new file mode 100644 index 00000000..96acdd6f --- /dev/null +++ b/charts/thanos-mixin/templates/configmap-sidecar.yaml @@ -0,0 +1,21 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ printf "%s-%s" (include "thanos-mixin.fullname" .) "sidecar" | trunc 63 | trimSuffix "-" }} + namespace: {{ include "thanos-mixin.namespace" . }} + annotations: +{{ include "thanos-mixin.annotations" . | indent 4 }} +{{- if .Values.additionalAnnotations }} +{{ toYaml .Values.additionalAnnotations | indent 4 }} +{{- end }} + labels: + app: {{ include "thanos-mixin.name" . }} +{{ include "thanos-mixin.labels" . | indent 4 }} +{{- if .Values.additionalLabels }} +{{ toYaml .Values.additionalLabels | indent 4 }} +{{- end }} + grafana_dashboard: {{ include "thanos-mixin.name" . }}-sidecar +data: + sidecar.json: |- +{{ .Files.Get "dashboards/sidecar.json" | indent 4}} \ No newline at end of file diff --git a/charts/thanos-mixin/templates/configmap-store.yaml b/charts/thanos-mixin/templates/configmap-store.yaml new file mode 100644 index 00000000..25a73b14 --- /dev/null +++ b/charts/thanos-mixin/templates/configmap-store.yaml @@ -0,0 +1,21 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ printf "%s-%s" (include "thanos-mixin.fullname" .) "store" | trunc 63 | trimSuffix "-" }} + namespace: {{ include "thanos-mixin.namespace" . }} + annotations: +{{ include "thanos-mixin.annotations" . | indent 4 }} +{{- if .Values.additionalAnnotations }} +{{ toYaml .Values.additionalAnnotations | indent 4 }} +{{- end }} + labels: + app: {{ include "thanos-mixin.name" . }} +{{ include "thanos-mixin.labels" . | indent 4 }} +{{- if .Values.additionalLabels }} +{{ toYaml .Values.additionalLabels | indent 4 }} +{{- end }} + grafana_dashboard: {{ include "thanos-mixin.name" . }}-store +data: + store.json: |- +{{ .Files.Get "dashboards/store.json" | indent 4}} \ No newline at end of file diff --git a/charts/thanos-mixin/templates/rules.yaml b/charts/thanos-mixin/templates/rules.yaml index 20a2aa7e..48abba83 100644 --- a/charts/thanos-mixin/templates/rules.yaml +++ b/charts/thanos-mixin/templates/rules.yaml @@ -2,7 +2,7 @@ apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: - name: {{ printf "%s-%s" (include "thanos-mixin.fullname" .) "rules" | trunc 63 | trimSuffix "-" }}-rules + name: {{ printf "%s-%s" (include "thanos-mixin.fullname" .) "rules" | trunc 63 | trimSuffix "-" }} namespace: {{ include "thanos-mixin.namespace" . }} annotations: {{ include "thanos-mixin.annotations" . | indent 4 }} @@ -21,35 +21,35 @@ spec: rules: - expr: | ( - sum(rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*", grpc_type="unary"}[5m])) + sum by (job) (rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*", grpc_type="unary"}[5m])) / - sum(rate(grpc_client_started_total{job=~"thanos-query.*", grpc_type="unary"}[5m])) + sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*", grpc_type="unary"}[5m])) ) record: :grpc_client_failures_per_unary:sum_rate - expr: | ( - sum(rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*", grpc_type="server_stream"}[5m])) + sum by (job) (rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*", grpc_type="server_stream"}[5m])) / - sum(rate(grpc_client_started_total{job=~"thanos-query.*", grpc_type="server_stream"}[5m])) + sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*", grpc_type="server_stream"}[5m])) ) record: :grpc_client_failures_per_stream:sum_rate - expr: | ( - sum(rate(thanos_query_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m])) + sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m])) / - sum(rate(thanos_query_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m])) + sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m])) ) record: :thanos_query_store_apis_dns_failures_per_lookup:sum_rate - expr: | histogram_quantile(0.99, - sum(rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m])) by (le) + sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m])) ) labels: quantile: "0.99" record: :query_duration_seconds:histogram_quantile - expr: | histogram_quantile(0.99, - sum(rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m])) by (le) + sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m])) ) labels: quantile: "0.99" @@ -57,80 +57,80 @@ spec: - name: thanos-receive.rules rules: - expr: | - sum( - rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-receive.*", grpc_type="unary"}[5m]) + ( + sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-receive.*", grpc_type="unary"}[5m])) / - rate(grpc_server_started_total{job=~"thanos-receive.*", grpc_type="unary"}[5m]) + sum by (job) (rate(grpc_server_started_total{job=~".*thanos-receive.*", grpc_type="unary"}[5m])) ) record: :grpc_server_failures_per_unary:sum_rate - expr: | - sum( - rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-receive.*", grpc_type="server_stream"}[5m]) + ( + sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-receive.*", grpc_type="server_stream"}[5m])) / - rate(grpc_server_started_total{job=~"thanos-receive.*", grpc_type="server_stream"}[5m]) + sum by (job) (rate(grpc_server_started_total{job=~".*thanos-receive.*", grpc_type="server_stream"}[5m])) ) record: :grpc_server_failures_per_stream:sum_rate - expr: | - sum( - rate(http_requests_total{handler="receive", job=~"thanos-receive.*", code!~"5.."}[5m]) + ( + sum by (job) (rate(http_requests_total{handler="receive", job=~".*thanos-receive.*", code!~"5.."}[5m])) / - rate(http_requests_total{handler="receive", job=~"thanos-receive.*"}[5m]) + sum by (job) (rate(http_requests_total{handler="receive", job=~".*thanos-receive.*"}[5m])) ) record: :http_failure_per_request:sum_rate - expr: | histogram_quantile(0.99, - sum(rate(http_request_duration_seconds_bucket{handler="receive", job=~"thanos-receive.*"}[5m])) by (le) + sum by (job, le) (rate(http_request_duration_seconds_bucket{handler="receive", job=~".*thanos-receive.*"}[5m])) ) labels: quantile: "0.99" record: :http_request_duration_seconds:histogram_quantile - expr: | ( - sum(rate(thanos_receive_replications_total{result="error", job=~"thanos-receive.*"}[5m])) + sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m])) / - sum(rate(thanos_receive_replications_total{job=~"thanos-receive.*"}[5m])) + sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m])) ) record: :thanos_receive_replication_failure_per_requests:sum_rate - expr: | ( - sum(rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m])) + sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m])) / - sum(rate(thanos_receive_forward_requests_total{job=~"thanos-receive.*"}[5m])) + sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m])) ) record: :thanos_receive_forward_failure_per_requests:sum_rate - expr: | ( - sum(rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m])) + sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m])) / - sum(rate(thanos_receive_hashrings_file_refreshes_total{job=~"thanos-receive.*"}[5m])) + sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) ) record: :thanos_receive_hashring_file_failure_per_refresh:sum_rate - name: thanos-store.rules rules: - expr: | ( - sum(rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*", grpc_type="unary"}[5m])) + sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*", grpc_type="unary"}[5m])) / - sum(rate(grpc_server_started_total{job=~"thanos-store.*", grpc_type="unary"}[5m])) + sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*", grpc_type="unary"}[5m])) ) record: :grpc_server_failures_per_unary:sum_rate - expr: | ( - sum(rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*", grpc_type="server_stream"}[5m])) + sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*", grpc_type="server_stream"}[5m])) / - sum(rate(grpc_server_started_total{job=~"thanos-store.*", grpc_type="server_stream"}[5m])) + sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*", grpc_type="server_stream"}[5m])) ) record: :grpc_server_failures_per_stream:sum_rate - expr: | ( - sum(rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m])) + sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-store.*"}[5m])) / - sum(rate(thanos_objstore_bucket_operations_total{job=~"thanos-store.*"}[5m])) + sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m])) ) record: :thanos_objstore_bucket_failures_per_operation:sum_rate - expr: | histogram_quantile(0.99, - sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m])) by (le) + sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m])) ) labels: quantile: "0.99" diff --git a/hack/mixins.py b/hack/mixins.py index 43f4e1f5..726d7fa6 100755 --- a/hack/mixins.py +++ b/hack/mixins.py @@ -54,39 +54,76 @@ def download(url, filename): f.write(r.content) +def template(f, chart_dst, mixin_header): + orig = "%s" % f + filename = path.basename(orig) + + dest = "%s/%s" % (chart_dst, filename) + if os.path.exists(dest): + os.remove(dest) + logger.info("Copy %s %s", orig, dest) + + with open(dest, "w") as file: + # header = fileinput.input(mixin_header) + # file.writelines(header) + + header = open(mixin_header, "rt") + for line in header: + file.write(line.replace("__name__", filename.replace(".yaml", ""))) + + fin = open(orig, "rt") + for line in fin: + file.write(" %s" % escape(line)) + + +def template_configmap(f, chart_dst, mixin_header): + orig = "%s" % f + filename = path.basename(orig) + + dashboard = "%s/dashboards/%s" % (chart_dst, filename) + if os.path.exists(dashboard): + os.remove(dashboard) + logger.info("Copy %s => %s", orig, dashboard) + shutil.copy(orig, dashboard) + + dest = "%s/templates/configmap-%s" % (chart_dst, filename.replace(".json", ".yaml")) + if os.path.exists(dest): + os.remove(dest) + logger.info("Manage %s => %s", orig, dest) + + with open(dest, "w") as file: + header = open(mixin_header, "rt") + for line in header: + file.write( + line.replace( + "__name__", filename.replace(".json", "").replace("_", "-") + ) + ) + file.write(" %s: |-\n" % filename) + file.write('{{ .Files.Get "dashboards/%s" | indent 4}}' % filename) + + def manage_mixin(mixin_directory, mixin): logger.info("Manage %s", mixin) - mixin_header = "%s/%s.tpl" % (os.path.dirname(__file__), mixin) - if not os.path.exists(mixin_header): - logger.warning("Header not found: %s", mixin_header) - return - - chart_dst = "charts/%s/templates" % mixin + chart_dst = "charts/%s" % mixin if not os.path.exists(chart_dst): logger.warning("Mixin chart not found: %s", chart_dst) return - files = glob.glob("%s/%s/prometheus/*.yaml" % (mixin_directory, mixin)) - for f in files: - orig = "%s" % f - dest = "%s/%s" % (chart_dst, path.basename(orig)) - if os.path.exists(dest): - os.remove(dest) - logger.info("Copy %s %s", orig, dest) + prom_header = "%s/%s.tpl" % (os.path.dirname(__file__), mixin) + if not os.path.exists(prom_header): + logger.warning("Header not found: %s", prom_header) + return + for f in glob.glob("%s/%s/prometheus/*.yaml" % (mixin_directory, mixin)): + template(f, "%s/templates" % chart_dst, prom_header) - fin = open(orig, "rt") - # fout = open(dest, "w") - # for line in fin: - # fout.write(escape(line)) - - with open(dest, "w") as file: - header = fileinput.input(mixin_header) - file.writelines(header) - # input_lines = fileinput.input(orig) - # file.writelines(input_lines) - for line in fin: - file.write(escape(line)) + dashboard_header = "%s/%s_dashboard.tpl" % (os.path.dirname(__file__), mixin) + if not os.path.exists(dashboard_header): + logger.warning("Header not found: %s", dashboard_header) + return + for f in glob.glob("%s/%s/dashboards/*.json" % (mixin_directory, mixin)): + template_configmap(f, chart_dst, dashboard_header) def main(url, filename, mixin_directory, chart): diff --git a/hack/thanos-mixin.tpl b/hack/thanos-mixin.tpl new file mode 100644 index 00000000..b88cb5cf --- /dev/null +++ b/hack/thanos-mixin.tpl @@ -0,0 +1,18 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "thanos-mixin.fullname" .) "__name__" | trunc 63 | trimSuffix "-" }} + namespace: {{ include "thanos-mixin.namespace" . }} + annotations: +{{ include "thanos-mixin.annotations" . | indent 4 }} +{{- if .Values.additionalAnnotations }} +{{ toYaml .Values.additionalAnnotations | indent 4 }} +{{- end }} + labels: + app: {{ include "thanos-mixin.name" . }} +{{ include "thanos-mixin.labels" . | indent 4 }} +{{- if .Values.additionalLabels }} +{{ toYaml .Values.additionalLabels | indent 4 }} +{{- end }} +spec: diff --git a/hack/thanos-mixin_dashboard.tpl b/hack/thanos-mixin_dashboard.tpl new file mode 100644 index 00000000..5e78849e --- /dev/null +++ b/hack/thanos-mixin_dashboard.tpl @@ -0,0 +1,19 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ printf "%s-%s" (include "thanos-mixin.fullname" .) "__name__" | trunc 63 | trimSuffix "-" }} + namespace: {{ include "thanos-mixin.namespace" . }} + annotations: +{{ include "thanos-mixin.annotations" . | indent 4 }} +{{- if .Values.additionalAnnotations }} +{{ toYaml .Values.additionalAnnotations | indent 4 }} +{{- end }} + labels: + app: {{ include "thanos-mixin.name" . }} +{{ include "thanos-mixin.labels" . | indent 4 }} +{{- if .Values.additionalLabels }} +{{ toYaml .Values.additionalLabels | indent 4 }} +{{- end }} + grafana_dashboard: {{ include "thanos-mixin.name" . }}-__name__ +data: