diff --git a/etcdserver/api/etcdhttp/metrics.go b/etcdserver/api/etcdhttp/metrics.go
index ff16d81062f..ef3d9544e10 100644
--- a/etcdserver/api/etcdhttp/metrics.go
+++ b/etcdserver/api/etcdhttp/metrics.go
@@ -36,7 +36,7 @@ const (
 // HandleMetricsHealth registers metrics and health handlers.
 func HandleMetricsHealth(mux *http.ServeMux, srv etcdserver.ServerV2) {
 	mux.Handle(PathMetrics, promhttp.Handler())
-	mux.Handle(PathHealth, NewHealthHandler(func() Health { return checkHealth(srv) }))
+	mux.Handle(PathHealth, NewHealthHandler(func(excludedAlarms AlarmSet) Health { return checkHealth(srv, excludedAlarms) }))
 }
 
 // HandlePrometheus registers prometheus handler on '/metrics'.
@@ -45,7 +45,7 @@ func HandlePrometheus(mux *http.ServeMux) {
 }
 
 // NewHealthHandler handles '/health' requests.
-func NewHealthHandler(hfunc func() Health) http.HandlerFunc {
+func NewHealthHandler(hfunc func(excludedAlarms AlarmSet) Health) http.HandlerFunc {
 	return func(w http.ResponseWriter, r *http.Request) {
 		if r.Method != http.MethodGet {
 			w.Header().Set("Allow", http.MethodGet)
@@ -53,7 +53,8 @@ func NewHealthHandler(hfunc func() Health) http.HandlerFunc {
 			plog.Warningf("/health error (status code %d)", http.StatusMethodNotAllowed)
 			return
 		}
-		h := hfunc()
+		excludedAlarms := getExcludedAlarms(r)
+		h := hfunc(excludedAlarms)
 		d, _ := json.Marshal(h)
 		if h.Health != "true" {
 			http.Error(w, string(d), http.StatusServiceUnavailable)
@@ -90,19 +91,61 @@ type Health struct {
 	Health string `json:"health"`
 }
 
+// AlarmSet is a set of alarm names keyed by the alarm's string form
+// (for example "NOSPACE").
+type AlarmSet map[string]struct{}
+
+// getExcludedAlarms collects the values of every "exclude" query parameter
+// of r into an AlarmSet. Empty values (as in "?exclude=") are skipped. The
+// returned set is never nil.
+func getExcludedAlarms(r *http.Request) (alarms AlarmSet) {
+	alarms = make(map[string]struct{}, 2)
+	alms, found := r.URL.Query()["exclude"]
+	if found {
+		for _, alm := range alms {
+			if len(alm) == 0 {
+				continue
+			}
+			alarms[alm] = struct{}{}
+		}
+	}
+	return alarms
+}
+
 // TODO: server NOSPACE, etcdserver.ErrNoLeader in health API
 
-func checkHealth(srv etcdserver.ServerV2) Health {
+// checkHealth reports the server as unhealthy as soon as it finds an active
+// alarm whose name is not in excludedAlarms. excludedAlarms is only read,
+// never modified, so every member raising an excluded alarm type is skipped.
+func checkHealth(srv etcdserver.ServerV2, excludedAlarms AlarmSet) Health {
 	h := Health{Health: "true"}
 
+	// Exclusions that have not matched any active alarm yet. This is a copy
+	// so that the caller's set stays intact.
+	unmatched := make(AlarmSet, len(excludedAlarms))
+	for name := range excludedAlarms {
+		unmatched[name] = struct{}{}
+	}
+
 	as := srv.Alarms()
 	if len(as) > 0 {
-		h.Health = "false"
 		for _, v := range as {
-			plog.Warningf("/health error due to an alarm %s", v.String())
+			alarmName := v.Alarm.String()
+			if _, found := excludedAlarms[alarmName]; found {
+				plog.Debugf("/health excluded alarm %s", alarmName)
+				delete(unmatched, alarmName)
+				continue
+			}
+			h.Health = "false"
+			plog.Warningf("/health error due to %s", v.String())
+			return h
 		}
 	}
 
+	if len(unmatched) > 0 {
+		plog.Warningf("fail exclude alarms from health check, exclude alarms %+v", unmatched)
+	}
+
 	if h.Health == "true" {
 		if uint64(srv.Leader()) == raft.None {
 			h.Health = "false"
diff --git a/etcdserver/api/etcdhttp/metrics_test.go b/etcdserver/api/etcdhttp/metrics_test.go
new file mode 100644
index 00000000000..b24fec1158e
--- /dev/null
+++ b/etcdserver/api/etcdhttp/metrics_test.go
@@ -0,0 +1,165 @@
+// Copyright 2021 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package etcdhttp
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+
+	"go.etcd.io/etcd/etcdserver"
+	stats "go.etcd.io/etcd/etcdserver/api/v2stats"
+	pb "go.etcd.io/etcd/etcdserver/etcdserverpb"
+	"go.etcd.io/etcd/pkg/testutil"
+	"go.etcd.io/etcd/pkg/types"
+	"go.etcd.io/etcd/raft"
+)
+
+// fakeStats is a stub for the stats.Stats field of fakeServerV2; every method returns nil.
+type fakeStats struct{}
+
+func (s *fakeStats) SelfStats() []byte   { return nil }
+func (s *fakeStats) LeaderStats() []byte { return nil }
+func (s *fakeStats) StoreStats() []byte  { return nil }
+
+// fakeServerV2 is the server handed to HandleMetricsHealth in tests. When
+// health is "true" it reports a leader and Do succeeds; otherwise both fail.
+type fakeServerV2 struct {
+	fakeServer
+	stats.Stats
+	health string
+}
+
+func (s *fakeServerV2) Leader() types.ID {
+	if s.health == "true" {
+		return 1
+	}
+	return types.ID(raft.None)
+}
+func (s *fakeServerV2) Do(ctx context.Context, r pb.Request) (etcdserver.Response, error) {
+	if s.health == "true" {
+		return etcdserver.Response{}, nil
+	}
+	return etcdserver.Response{}, fmt.Errorf("fail health check")
+}
+func (s *fakeServerV2) ClientCertAuthEnabled() bool { return false }
+
+// TestHealthHandler serves '/health' through HandleMetricsHealth against a fake
+// server and checks the HTTP status code and the reported health per case.
+func TestHealthHandler(t *testing.T) {
+	// define the input and expected output
+	// input: alarms, and healthCheckURL
+	tests := []struct {
+		alarms         []*pb.AlarmMember
+		healthCheckURL string
+		statusCode     int
+		health         string
+	}{
+		{
+			[]*pb.AlarmMember{},
+			"/health",
+			http.StatusOK,
+			"true",
+		},
+		{
+			[]*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}},
+			"/health",
+			http.StatusServiceUnavailable,
+			"false",
+		},
+		{
+			[]*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}},
+			"/health?exclude=NOSPACE",
+			http.StatusOK,
+			"true",
+		},
+		{
+			[]*pb.AlarmMember{},
+			"/health?exclude=NOSPACE",
+			http.StatusOK,
+			"true",
+		},
+		{
+			[]*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}, {MemberID: uint64(1), Alarm: pb.AlarmType_CORRUPT}},
+			"/health?exclude=NOSPACE",
+			http.StatusServiceUnavailable,
+			"false",
+		},
+		{
+			[]*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}, {MemberID: uint64(1), Alarm: pb.AlarmType_CORRUPT}},
+			"/health?exclude=NOSPACE&exclude=CORRUPT",
+			http.StatusOK,
+			"true",
+		},
+		{
+			// two members raising the same excluded alarm type must both be skipped
+			[]*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}, {MemberID: uint64(1), Alarm: pb.AlarmType_NOSPACE}},
+			"/health?exclude=NOSPACE",
+			http.StatusOK,
+			"true",
+		},
+	}
+
+	for i, tt := range tests {
+		func() {
+			mux := http.NewServeMux()
+			HandleMetricsHealth(mux, &fakeServerV2{
+				fakeServer: fakeServer{alarms: tt.alarms},
+				Stats:      &fakeStats{},
+				health:     tt.health,
+			})
+			ts := httptest.NewServer(mux)
+			defer ts.Close()
+
+			res, err := ts.Client().Do(&http.Request{Method: http.MethodGet, URL: testutil.MustNewURL(t, ts.URL+tt.healthCheckURL)})
+			if err != nil {
+				t.Errorf("fail serve http request %s %v in test case #%d", tt.healthCheckURL, err, i+1)
+			}
+			if res == nil {
+				t.Errorf("got nil http response with http request %s in test case #%d", tt.healthCheckURL, i+1)
+				return
+			}
+			defer res.Body.Close()
+			if res.StatusCode != tt.statusCode {
+				t.Errorf("want statusCode %d but got %d in test case #%d", tt.statusCode, res.StatusCode, i+1)
+			}
+			health, err := parseHealthOutput(res.Body)
+			if err != nil {
+				t.Errorf("fail parse health check output %v", err)
+			}
+			if health.Health != tt.health {
+				t.Errorf("want health %s but got %s", tt.health, health.Health)
+			}
+		}()
+	}
+}
+
+// parseHealthOutput reads body to the end and decodes it as a JSON Health value.
+func parseHealthOutput(body io.Reader) (Health, error) {
+	obj := Health{}
+	d, derr := ioutil.ReadAll(body)
+	if derr != nil {
+		return obj, derr
+	}
+	if err := json.Unmarshal(d, &obj); err != nil {
+		return obj, err
+	}
+	return obj, nil
+}
diff --git a/etcdserver/api/etcdhttp/peer_test.go b/etcdserver/api/etcdhttp/peer_test.go
index 8a5a8c8c855..d39aa60b2ac 100644
--- a/etcdserver/api/etcdhttp/peer_test.go
+++ b/etcdserver/api/etcdhttp/peer_test.go
@@ -58,6 +58,7 @@ func (c *fakeCluster) Version() *semver.Version { return nil }
 
 type fakeServer struct {
 	cluster api.Cluster
+	alarms  []*pb.AlarmMember
 }
 
 func (s *fakeServer) AddMember(ctx context.Context, memb membership.Member) ([]*membership.Member, error) {
@@ -74,7 +75,7 @@ func (s *fakeServer) PromoteMember(ctx context.Context, id uint64) ([]*membershi
 }
 func (s *fakeServer) ClusterVersion() *semver.Version { return nil }
 func (s *fakeServer) Cluster() api.Cluster            { return s.cluster }
-func (s *fakeServer) Alarms() []*pb.AlarmMember       { return nil }
+func (s *fakeServer) Alarms() []*pb.AlarmMember       { return s.alarms }
 
 var fakeRaftHandler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 	w.Write([]byte("test data"))
diff --git a/proxy/grpcproxy/health.go b/proxy/grpcproxy/health.go
index 4756cff2ed4..ad7358ec579 100644
--- a/proxy/grpcproxy/health.go
+++ b/proxy/grpcproxy/health.go
@@ -26,7 +26,7 @@ import (
 
 // HandleHealth registers health handler on '/health'.
 func HandleHealth(mux *http.ServeMux, c *clientv3.Client) {
-	mux.Handle(etcdhttp.PathHealth, etcdhttp.NewHealthHandler(func() etcdhttp.Health { return checkHealth(c) }))
+	mux.Handle(etcdhttp.PathHealth, etcdhttp.NewHealthHandler(func(excludedAlarms etcdhttp.AlarmSet) etcdhttp.Health { return checkHealth(c) }))
 }
 
 func checkHealth(c *clientv3.Client) etcdhttp.Health {