Skip to content

Commit

Permalink
[alerting] Add severity and refactor code (#569)
Browse files Browse the repository at this point in the history
* Move AlertInfo to its own package so that it can be shared across notifier implementations.
* Remove "json" from alert fields. It's not useful for most of the notifier types.
* Add "severity" to alert configuration. This will be included in the alert details and will be used for PagerDuty events. 
* Add "other_info" to include additional information about alerts. This information can include fields like: team, additional dashboards, etc.
* Include all fields as details in the Slack and Email messages. PagerDuty already included all fields in custom details field.
  • Loading branch information
manugarg committed Oct 17, 2023
1 parent 47da354 commit b88ccfa
Show file tree
Hide file tree
Showing 18 changed files with 592 additions and 216 deletions.
94 changes: 94 additions & 0 deletions probes/alerting/alertinfo/alertinfo.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
// Copyright 2023 The Cloudprober Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package alertinfo implements AlertInfo struct for sharing alert data
// across modules.
package alertinfo

import (
"sort"
"strconv"
"strings"
"time"

"github.com/cloudprober/cloudprober/common/strtemplate"
"github.com/cloudprober/cloudprober/targets/endpoint"
)

// AlertInfo contains information about an alert. It is the data that the
// Fields method flattens into a string map for templates and notifications.
type AlertInfo struct {
// Name is the alert's name; Fields exposes it under the "alert" key.
Name string
// ProbeName is exposed by Fields under the "probe" key.
// NOTE(review): presumably the name of the probe that raised the alert —
// confirm against the caller.
ProbeName string
// ConditionID is exposed by Fields under the "condition_id" key.
ConditionID string
// Target is the endpoint the alert is about. Fields reads its Dst(), its
// Labels and, when non-nil, its IP.
Target endpoint.Endpoint
// Failures is exposed by Fields under the "failures" key.
// NOTE(review): presumably the number of failed probe runs — confirm.
Failures int
// Total is exposed by Fields under the "total" key.
// NOTE(review): presumably the number of probe runs considered — confirm.
Total int
// FailingSince is exposed by Fields under the "since" key, formatted as
// RFC 3339.
FailingSince time.Time
}

// Fields returns the alert's data as a flat map of strings.
//
// The returned map always contains the built-in keys "alert", "probe",
// "target", "condition_id", "failures", "total" and "since" (formatted as
// RFC 3339). Every target label is added as "target.label.<name>", and
// "target_ip" is added when the target has a non-nil IP.
//
// Each entry of templateDetails is then expanded with
// strtemplate.SubstituteLabels against the fields collected so far, and the
// result is stored under the entry's own key (replacing a built-in field of
// the same name, if any). Entries are processed in sorted key order so that
// the output is deterministic even when one template refers to another.
// A non-empty "details" entry is always expanded last, so it can refer to any
// of the other fields.
func (ai *AlertInfo) Fields(templateDetails map[string]string) map[string]string {
	fields := map[string]string{
		"alert":        ai.Name,
		"probe":        ai.ProbeName,
		"target":       ai.Target.Dst(),
		"condition_id": ai.ConditionID,
		"failures":     strconv.Itoa(ai.Failures),
		"total":        strconv.Itoa(ai.Total),
		"since":        ai.FailingSince.Format(time.RFC3339),
	}

	for k, v := range ai.Target.Labels {
		fields["target.label."+k] = v
	}

	if ai.Target.IP != nil {
		fields["target_ip"] = ai.Target.IP.String()
	}

	// Expand templates in sorted key order. Each expansion sees the fields
	// added by earlier ones, so ranging over the map directly would make the
	// result depend on Go's unspecified map iteration order.
	keys := make([]string, 0, len(templateDetails))
	for k := range templateDetails {
		if k != "details" {
			keys = append(keys, k)
		}
	}
	sort.Strings(keys)

	for _, k := range keys {
		// The second result is intentionally ignored: the substituted string
		// is stored regardless, exactly as before.
		fields[k], _ = strtemplate.SubstituteLabels(templateDetails[k], fields)
	}

	// Note that we parse details in the end, that's because details template
	// may use other parsed fields like dashboard_url, playbook_url, etc.
	if details := templateDetails["details"]; details != "" {
		fields["details"], _ = strtemplate.SubstituteLabels(details, fields)
	}

	return fields
}

// FieldsToString renders fields as "key: value" lines joined by newlines and
// ordered by key. Any key listed in skipKeys is left out of the output. An
// empty result is returned when there is nothing to render.
func FieldsToString(fields map[string]string, skipKeys ...string) string {
	skipped := make(map[string]struct{}, len(skipKeys))
	for _, key := range skipKeys {
		skipped[key] = struct{}{}
	}

	names := make([]string, 0, len(fields))
	for name := range fields {
		if _, skip := skipped[name]; skip {
			continue
		}
		names = append(names, name)
	}
	// Map iteration order is random; sort for stable output.
	sort.Strings(names)

	var sb strings.Builder
	for i, name := range names {
		if i > 0 {
			sb.WriteByte('\n')
		}
		sb.WriteString(name)
		sb.WriteString(": ")
		sb.WriteString(fields[name])
	}
	return sb.String()
}
135 changes: 135 additions & 0 deletions probes/alerting/alertinfo/alertinfo_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
// Copyright 2023 The Cloudprober Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package alertinfo implements AlertInfo struct for sharing alert data
// across modules.
package alertinfo

import (
"net"
"testing"
"time"

"github.com/cloudprober/cloudprober/targets/endpoint"
"github.com/stretchr/testify/assert"
)

// TestAlertInfoFields checks the map produced by AlertInfo.Fields, both
// without any templates and with templates that refer to built-in fields and
// to each other through "details".
func TestAlertInfoFields(t *testing.T) {
	ep := endpoint.Endpoint{
		Name: "test-target",
		IP:   net.ParseIP("10.11.12.13"),
		Labels: map[string]string{
			"apptype":  "backend",
			"language": "go",
		},
	}

	// newAlertInfo builds the AlertInfo shared by all cases.
	newAlertInfo := func() *AlertInfo {
		return &AlertInfo{
			Name:         "test-alert",
			ProbeName:    "test-probe",
			ConditionID:  "122333444",
			Target:       ep,
			Failures:     8,
			Total:        12,
			FailingSince: time.Time{}.Add(time.Second),
		}
	}

	// wantWith returns the built-in fields expected in every case, merged
	// with the case-specific extras.
	wantWith := func(extra map[string]string) map[string]string {
		expected := map[string]string{
			"alert":                 "test-alert",
			"probe":                 "test-probe",
			"condition_id":          "122333444",
			"target":                "test-target",
			"target_ip":             "10.11.12.13",
			"failures":              "8",
			"total":                 "12",
			"since":                 "0001-01-01T00:00:01Z",
			"target.label.apptype":  "backend",
			"target.label.language": "go",
		}
		for key, val := range extra {
			expected[key] = val
		}
		return expected
	}

	cases := []struct {
		name            string
		ai              *AlertInfo
		templateDetails map[string]string
		want            map[string]string
	}{
		{
			name: "no_template_details",
			ai:   newAlertInfo(),
			want: wantWith(nil),
		},
		{
			name: "with_template_details",
			ai:   newAlertInfo(),
			templateDetails: map[string]string{
				"summary":       "Cloudprober alert \"@alert@\" for \"@target@\"",
				"dashboard_url": "https://my-dashboard.com/probe=@probe@&target=@target@",
				"details":       "Dashboard: @dashboard_url@",
			},
			want: wantWith(map[string]string{
				"summary":       "Cloudprober alert \"test-alert\" for \"test-target\"",
				"dashboard_url": "https://my-dashboard.com/probe=test-probe&target=test-target",
				"details":       "Dashboard: https://my-dashboard.com/probe=test-probe&target=test-target",
			}),
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := tc.ai.Fields(tc.templateDetails)
			assert.Equal(t, tc.want, got, "Fields don't match")
		})
	}
}

// TestFieldsToString checks that FieldsToString emits "key: value" lines in
// sorted key order and honours the skip list, including keys that are not
// present in the map and the case where every key is skipped.
func TestFieldsToString(t *testing.T) {
	fields := map[string]string{
		"alert":        "test-alert",
		"probe":        "test-probe",
		"condition_id": "122333444",
	}

	tests := []struct {
		name     string
		skipKeys []string
		want     string
	}{
		{
			name: "skip_none",
			want: "alert: test-alert\ncondition_id: 122333444\nprobe: test-probe",
		},
		{
			// Previously named "skip_probe", although the key being skipped
			// is condition_id.
			name:     "skip_condition_id",
			skipKeys: []string{"condition_id"},
			want:     "alert: test-alert\nprobe: test-probe",
		},
		{
			name:     "skip_multiple",
			skipKeys: []string{"alert", "probe"},
			want:     "condition_id: 122333444",
		},
		{
			name:     "skip_unknown_key",
			skipKeys: []string{"no-such-key"},
			want:     "alert: test-alert\ncondition_id: 122333444\nprobe: test-probe",
		},
		{
			name:     "skip_all",
			skipKeys: []string{"alert", "probe", "condition_id"},
			want:     "",
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			assert.Equal(t, tt.want, FieldsToString(fields, tt.skipKeys...), "Fields don't match")
		})
	}
}
5 changes: 3 additions & 2 deletions probes/alerting/alerting.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (

"github.com/cloudprober/cloudprober/logger"
"github.com/cloudprober/cloudprober/metrics"
"github.com/cloudprober/cloudprober/probes/alerting/alertinfo"
"github.com/cloudprober/cloudprober/probes/alerting/notifier"
configpb "github.com/cloudprober/cloudprober/probes/alerting/proto"
"github.com/cloudprober/cloudprober/targets/endpoint"
Expand All @@ -50,7 +51,7 @@ type AlertHandler struct {
probeName string
condition *configpb.Condition
notifyConfig *configpb.NotifyConfig
notifyCh chan *notifier.AlertInfo // Used only for testing for now.
notifyCh chan *alertinfo.AlertInfo // Used only for testing for now.
notifier *notifier.Notifier

mu sync.Mutex
Expand Down Expand Up @@ -125,7 +126,7 @@ func (ah *AlertHandler) notify(ep endpoint.Endpoint, ts *targetState, totalFailu
ah.l.Warningf("ALERT (%s): target (%s), failures (%d) higher than (%d) since (%v)", ah.name, ep.Name, totalFailures, ah.condition.Failures, ts.failingSince)

ts.alerted = true
alertInfo := &notifier.AlertInfo{
alertInfo := &alertinfo.AlertInfo{
Name: ah.name,
ProbeName: ah.probeName,
ConditionID: ts.conditionID,
Expand Down
20 changes: 10 additions & 10 deletions probes/alerting/alerting_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ import (
"sync"
"time"

"github.com/cloudprober/cloudprober/probes/alerting/notifier"
"github.com/cloudprober/cloudprober/probes/alerting/alertinfo"
)

var statusTmpl = template.Must(template.New("status").Parse(`
Expand Down Expand Up @@ -78,29 +78,29 @@ var statusTmpl = template.Must(template.New("status").Parse(`
// resolvedAlert is used to keep track of resolved alerts, to be able to show
// alerts history on the alerts dashboard.
type resolvedAlert struct {
AlertInfo *notifier.AlertInfo
AlertInfo *alertinfo.AlertInfo
ResolvedAt time.Time
}

var maxAlertsHistory = 20

type state struct {
mu sync.RWMutex
currentAlerts map[string]*notifier.AlertInfo
currentAlerts map[string]*alertinfo.AlertInfo
resolvedAlerts []resolvedAlert
}

func (st *state) get(key string) *notifier.AlertInfo {
func (st *state) get(key string) *alertinfo.AlertInfo {
st.mu.RLock()
defer st.mu.RUnlock()
return st.currentAlerts[key]
}

func (st *state) add(key string, ai *notifier.AlertInfo) {
func (st *state) add(key string, ai *alertinfo.AlertInfo) {
st.mu.Lock()
defer st.mu.Unlock()
if st.currentAlerts == nil {
st.currentAlerts = make(map[string]*notifier.AlertInfo)
st.currentAlerts = make(map[string]*alertinfo.AlertInfo)
}
st.currentAlerts[key] = ai
}
Expand All @@ -117,11 +117,11 @@ func (st *state) resolve(key string) {
delete(st.currentAlerts, key)
}

func (st *state) list() ([]*notifier.AlertInfo, []resolvedAlert) {
func (st *state) list() ([]*alertinfo.AlertInfo, []resolvedAlert) {
st.mu.RLock()
defer st.mu.RUnlock()

var currentAlerts []*notifier.AlertInfo
var currentAlerts []*alertinfo.AlertInfo
for _, ai := range st.currentAlerts {
currentAlerts = append(currentAlerts, ai)
}
Expand All @@ -145,7 +145,7 @@ func (st *state) statusHTML() (string, error) {
}

err := statusTmpl.Execute(&statusBuf, struct {
CurrentAlerts []*notifier.AlertInfo
CurrentAlerts []*alertinfo.AlertInfo
PreviousAlerts []resolvedAlert
}{
CurrentAlerts: currentAlerts, PreviousAlerts: previousAlerts,
Expand All @@ -155,7 +155,7 @@ func (st *state) statusHTML() (string, error) {
}

var globalState = state{
currentAlerts: make(map[string]*notifier.AlertInfo),
currentAlerts: make(map[string]*alertinfo.AlertInfo),
}

func StatusHTML() (string, error) {
Expand Down

0 comments on commit b88ccfa

Please sign in to comment.