Skip to content

Commit

Permalink
Merge pull request #5907 from yeya24/release-1.17.0-rc1
Browse files Browse the repository at this point in the history
Cherrypick commits to release-1.17 for new RC
  • Loading branch information
yeya24 committed Apr 28, 2024
2 parents cd3f7c6 + e9704d1 commit 7d0b1d3
Show file tree
Hide file tree
Showing 13 changed files with 130 additions and 89 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
* [FEATURE] Ruler: Add `ruler.concurrent-evals-enabled` flag to enable concurrent evaluation within a single rule group for independent rules. Maximum concurrency can be configured via `ruler.max-concurrent-evals`. #5766
* [FEATURE] Distributor Queryable: Experimental: Add config `zone_results_quorum_metadata`. When querying ingesters using metadata APIs such as label names and values, only results from quorum number of zones will be included and merged. #5779
* [FEATURE] Storage Cache Clients: Add config `set_async_circuit_breaker_config` to utilize the circuit breaker pattern for dynamically thresholding asynchronous set operations. Implemented in both memcached and redis cache clients. #5789
* [FEATURE] Ruler: Add experimental `experimental.ruler.api-deduplicate-rules` flag to remove duplicate rule groups from the Prometheus compatible rules API endpoint. Add experimental `ruler.ring.replication-factor` and `ruler.ring.zone-awareness-enabled` flags to configure rule group replication, but only the first ruler in the replicaset evaluates the rule group, the rest will just hold a copy as backup. Add experimental `experimental.ruler.api-enable-rules-backup` flag to configure rulers to send the rule group backups stored in the replicaset to handle events when a ruler is down during an API request to list rules. #5782
* [FEATURE] Ruler: Add experimental `experimental.ruler.api-deduplicate-rules` flag to remove duplicate rule groups from the Prometheus compatible rules API endpoint. Add experimental `ruler.ring.replication-factor` and `ruler.ring.zone-awareness-enabled` flags to configure rule group replication, but only the first ruler in the replicaset evaluates the rule group, the rest will be used as backup to handle events when a ruler is down during an API request to list rules. #5782 #5901
* [FEATURE] Ring: Add experimental `-ingester.tokens-generator-strategy=minimize-spread` flag to enable the new minimize spread token generator strategy. #5855
* [FEATURE] Ring Status Page: Add `Ownership Diff From Expected` column in the ring table to indicate the extent to which the ownership of a specific ingester differs from the expected ownership. #5889
* [ENHANCEMENT] Ingester: Add per-tenant new metric `cortex_ingester_tsdb_data_replay_duration_seconds`. #5477
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.17.0-rc.0
1.17.0-rc.1
9 changes: 0 additions & 9 deletions docs/configuration/config-file-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -4271,15 +4271,6 @@ ring:
# CLI flag: -experimental.ruler.enable-api
[enable_api: <boolean> | default = false]

# EXPERIMENTAL: Enable rulers to store a copy of rules owned by other rulers
# with default state (state before any evaluation) and send this copy in list
# API requests as backup in case the ruler who owns the rule fails to send its
# rules. This allows the rules API to handle ruler outage by returning rules
# with default state. Ring replication-factor needs to be set to 2 or more for
# this to be useful.
# CLI flag: -experimental.ruler.api-enable-rules-backup
[api_enable_rules_backup: <boolean> | default = false]

# EXPERIMENTAL: Remove duplicate rules in the prometheus rules and alerts API
# response. If there are duplicate rules the rule with the latest evaluation
# timestamp will be kept.
Expand Down
9 changes: 4 additions & 5 deletions integration/ruler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,7 @@ func TestRulerAPIShardingWithAPIRulesBackupEnabled(t *testing.T) {
testRulerAPIWithSharding(t, true)
}

func testRulerAPIWithSharding(t *testing.T, enableAPIRulesBackup bool) {
func testRulerAPIWithSharding(t *testing.T, enableRulesBackup bool) {
const numRulesGroups = 100

random := rand.New(rand.NewSource(time.Now().UnixNano()))
Expand Down Expand Up @@ -459,9 +459,8 @@ func testRulerAPIWithSharding(t *testing.T, enableAPIRulesBackup bool) {
// Enable the bucket index so we can skip the initial bucket scan.
"-blocks-storage.bucket-store.bucket-index.enabled": "true",
}
if enableAPIRulesBackup {
if enableRulesBackup {
overrides["-ruler.ring.replication-factor"] = "3"
overrides["-experimental.ruler.api-enable-rules-backup"] = "true"
}
rulerFlags := mergeFlags(
BlocksStorageFlags(),
Expand Down Expand Up @@ -556,8 +555,8 @@ func testRulerAPIWithSharding(t *testing.T, enableAPIRulesBackup bool) {
},
}
// For each test case, fetch the rules with configured filters, and ensure the results match.
if enableAPIRulesBackup {
err := ruler2.Kill() // if api-enable-rules-backup is enabled the APIs should be able to handle a ruler going down
if enableRulesBackup {
err := ruler2.Kill() // if rules backup is enabled the APIs should be able to handle a ruler going down
require.NoError(t, err)
}
for name, tc := range testCases {
Expand Down
12 changes: 10 additions & 2 deletions pkg/distributor/distributor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,14 @@ var (
emptyResponse = &cortexpb.WriteResponse{}
)

var (
randomStrings = []string{}
)

func init() {
randomStrings = util.GenerateRandomStrings()
}

func TestConfig_Validate(t *testing.T) {
t.Parallel()
tests := map[string]struct {
Expand Down Expand Up @@ -2466,8 +2474,8 @@ func prepare(tb testing.TB, cfg prepConfig) ([]*Distributor, []*mockIngester, []
// Strings to be used for get labels values/Names
var unusedStrings []string
if cfg.lblValuesPerIngester > 0 {
unusedStrings = make([]string, min(len(util.RandomStrings), cfg.numIngesters*cfg.lblValuesPerIngester))
copy(unusedStrings, util.RandomStrings)
unusedStrings = make([]string, min(len(randomStrings), cfg.numIngesters*cfg.lblValuesPerIngester))
copy(unusedStrings, randomStrings)
}
s := &prepState{
unusedStrings: unusedStrings,
Expand Down
2 changes: 1 addition & 1 deletion pkg/ruler/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, eva
registry: reg,
logger: logger,
}
if cfg.APIEnableRulesBackup {
if cfg.RulesBackupEnabled() {
m.rulesBackupManager = newRulesBackupManager(cfg, logger, reg)
}
return m, nil
Expand Down
4 changes: 3 additions & 1 deletion pkg/ruler/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,9 @@ func TestBackupRules(t *testing.T) {
1 * time.Millisecond,
}
ruleManagerFactory := RuleManagerFactory(nil, waitDurations)
m, err := NewDefaultMultiTenantManager(Config{RulePath: dir, APIEnableRulesBackup: true}, ruleManagerFactory, evalMetrics, reg, log.NewNopLogger())
config := Config{RulePath: dir}
config.Ring.ReplicationFactor = 3
m, err := NewDefaultMultiTenantManager(config, ruleManagerFactory, evalMetrics, reg, log.NewNopLogger())
require.NoError(t, err)

const user1 = "testUser"
Expand Down
30 changes: 17 additions & 13 deletions pkg/ruler/ruler.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,9 +130,8 @@ type Config struct {
Ring RingConfig `yaml:"ring"`
FlushCheckPeriod time.Duration `yaml:"flush_period"`

EnableAPI bool `yaml:"enable_api"`
APIEnableRulesBackup bool `yaml:"api_enable_rules_backup"`
APIDeduplicateRules bool `yaml:"api_deduplicate_rules"`
EnableAPI bool `yaml:"enable_api"`
APIDeduplicateRules bool `yaml:"api_deduplicate_rules"`

EnabledTenants flagext.StringSliceCSV `yaml:"enabled_tenants"`
DisabledTenants flagext.StringSliceCSV `yaml:"disabled_tenants"`
Expand Down Expand Up @@ -200,7 +199,6 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
f.DurationVar(&cfg.FlushCheckPeriod, "ruler.flush-period", 1*time.Minute, "Period with which to attempt to flush rule groups.")
f.StringVar(&cfg.RulePath, "ruler.rule-path", "/rules", "file path to store temporary rule files for the prometheus rule managers")
f.BoolVar(&cfg.EnableAPI, "experimental.ruler.enable-api", false, "Enable the ruler api")
f.BoolVar(&cfg.APIEnableRulesBackup, "experimental.ruler.api-enable-rules-backup", false, "EXPERIMENTAL: Enable rulers to store a copy of rules owned by other rulers with default state (state before any evaluation) and send this copy in list API requests as backup in case the ruler who owns the rule fails to send its rules. This allows the rules API to handle ruler outage by returning rules with default state. Ring replication-factor needs to be set to 2 or more for this to be useful.")
f.BoolVar(&cfg.APIDeduplicateRules, "experimental.ruler.api-deduplicate-rules", false, "EXPERIMENTAL: Remove duplicate rules in the prometheus rules and alerts API response. If there are duplicate rules the rule with the latest evaluation timestamp will be kept.")
f.DurationVar(&cfg.OutageTolerance, "ruler.for-outage-tolerance", time.Hour, `Max time to tolerate outage for restoring "for" state of alert.`)
f.DurationVar(&cfg.ForGracePeriod, "ruler.for-grace-period", 10*time.Minute, `Minimum duration between alert and restored "for" state. This is maintained only for alerts with configured "for" time greater than grace period.`)
Expand All @@ -217,6 +215,12 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
cfg.RingCheckPeriod = 5 * time.Second
}

func (cfg *Config) RulesBackupEnabled() bool {
// If the replication factor is greater the 1, only the first replica is responsible for evaluating the rule,
// the rest of the replica will store the rule groups as backup only for API HA.
return cfg.Ring.ReplicationFactor > 1
}

// MultiTenantManager is the interface of interaction with a Manager that is tenant aware.
type MultiTenantManager interface {
// SyncRuleGroups is used to sync the Manager with rules from the RuleStore.
Expand Down Expand Up @@ -581,7 +585,7 @@ func (r *Ruler) syncRules(ctx context.Context, reason string) {
// This will also delete local group files for users that are no longer in 'configs' map.
r.manager.SyncRuleGroups(ctx, loadedConfigs)

if r.cfg.APIEnableRulesBackup {
if r.cfg.RulesBackupEnabled() {
r.manager.BackUpRuleGroups(ctx, backupConfigs)
}
}
Expand All @@ -604,7 +608,7 @@ func (r *Ruler) loadRuleGroups(ctx context.Context) (map[string]rulespb.RuleGrou
if err != nil {
level.Warn(r.logger).Log("msg", "failed to load some rules owned by this ruler", "count", len(ownedConfigs)-len(loadedOwnedConfigs), "err", err)
}
if r.cfg.APIEnableRulesBackup {
if r.cfg.RulesBackupEnabled() {
loadedBackupConfigs, err := r.store.LoadRuleGroups(ctx, backupConfigs)
if err != nil {
level.Warn(r.logger).Log("msg", "failed to load some rules backed up by this ruler", "count", len(backupConfigs)-len(loadedBackupConfigs), "err", err)
Expand Down Expand Up @@ -685,7 +689,7 @@ func (r *Ruler) listRulesShardingDefault(ctx context.Context) (map[string]rulesp
if len(owned) > 0 {
ownedConfigs[userID] = owned
}
if r.cfg.APIEnableRulesBackup {
if r.cfg.RulesBackupEnabled() {
backup := filterBackupRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), r.ring, r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
if len(backup) > 0 {
backedUpConfigs[userID] = backup
Expand Down Expand Up @@ -748,7 +752,7 @@ func (r *Ruler) listRulesShuffleSharding(ctx context.Context) (map[string]rulesp

filterOwned := filterRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), userRings[userID], r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
var filterBackup []*rulespb.RuleGroupDesc
if r.cfg.APIEnableRulesBackup {
if r.cfg.RulesBackupEnabled() {
filterBackup = filterBackupRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), userRings[userID], r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
}
if len(filterOwned) == 0 && len(filterBackup) == 0 {
Expand Down Expand Up @@ -1121,7 +1125,7 @@ func (r *Ruler) getShardedRules(ctx context.Context, userID string, rulesRequest
)

zoneByAddress := make(map[string]string)
if r.cfg.APIEnableRulesBackup {
if r.cfg.RulesBackupEnabled() {
for _, ruler := range rulers.Instances {
zoneByAddress[ruler.Addr] = ruler.Zone
}
Expand All @@ -1146,9 +1150,9 @@ func (r *Ruler) getShardedRules(ctx context.Context, userID string, rulesRequest
if err != nil {
level.Error(r.logger).Log("msg", "unable to retrieve rules from ruler", "addr", addr, "err", err)
r.rulerGetRulesFailures.WithLabelValues(addr).Inc()
// If APIEnableRulesBackup is enabled and there are enough rulers replicating the rules, we should
// If rules backup is enabled and there are enough rulers replicating the rules, we should
// be able to handle failures.
if r.cfg.APIEnableRulesBackup && len(jobs) >= r.cfg.Ring.ReplicationFactor {
if r.cfg.RulesBackupEnabled() && len(jobs) >= r.cfg.Ring.ReplicationFactor {
mtx.Lock()
failedZones[zoneByAddress[addr]] = struct{}{}
errCount += 1
Expand All @@ -1168,7 +1172,7 @@ func (r *Ruler) getShardedRules(ctx context.Context, userID string, rulesRequest
return nil
})

if err == nil && (r.cfg.APIEnableRulesBackup || r.cfg.APIDeduplicateRules) {
if err == nil && (r.cfg.RulesBackupEnabled() || r.cfg.APIDeduplicateRules) {
merged = mergeGroupStateDesc(merged)
}

Expand All @@ -1183,7 +1187,7 @@ func (r *Ruler) Rules(ctx context.Context, in *RulesRequest) (*RulesResponse, er
return nil, fmt.Errorf("no user id found in context")
}

groupDescs, err := r.getLocalRules(userID, *in, r.cfg.APIEnableRulesBackup)
groupDescs, err := r.getLocalRules(userID, *in, r.cfg.RulesBackupEnabled())
if err != nil {
return nil, err
}
Expand Down
5 changes: 4 additions & 1 deletion pkg/ruler/ruler_ring.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,10 @@ func GetReplicationSetForListRule(r ring.ReadRing, cfg *RingConfig) (ring.Replic
// to 0, and then we update them whether zone-awareness is enabled or not.
maxErrors := 0
maxUnavailableZones := 0
if cfg.ZoneAwarenessEnabled {
// Because ring's Get method returns a number of ruler equal to the replication factor even if there is only 1 zone
// and ZoneAwarenessEnabled, we can consider that ZoneAwarenessEnabled is disabled if there is only 1 zone since
// rules are still replicated to rulers in the same zone.
if cfg.ZoneAwarenessEnabled && len(ringZones) > 1 {
numReplicatedZones := min(len(ringZones), r.ReplicationFactor())
// Given that quorum is not required, we only need at least one of the zone to be healthy to succeed. But we
// also need to handle case when RF < number of zones.
Expand Down
17 changes: 17 additions & 0 deletions pkg/ruler/ruler_ring_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,23 @@ func TestGetReplicationSetForListRule(t *testing.T) {
"z2": {},
},
},
"should succeed on 1 unhealthy instances in RF=3 zone replication enabled but only 1 zone": {
ringInstances: map[string]ring.InstanceDesc{
"instance-1": {Addr: "127.0.0.1", State: ring.ACTIVE, Timestamp: now.Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-1", "z1", 128, true), Zone: "z1"},
"instance-2": {Addr: "127.0.0.2", State: ring.ACTIVE, Timestamp: now.Add(-10 * time.Second).Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-2", "z1", 128, true), Zone: "z1"},
"instance-3": {Addr: "127.0.0.3", State: ring.ACTIVE, Timestamp: now.Add(-20 * time.Second).Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-3", "z1", 128, true), Zone: "z1"},
"instance-4": {Addr: "127.0.0.4", State: ring.PENDING, Timestamp: now.Add(-30 * time.Second).Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-4", "z1", 128, true), Zone: "z1"},
},
ringHeartbeatTimeout: time.Minute,
ringReplicationFactor: 3,
expectedSet: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3"},
enableAZReplication: true,
expectedFailedZones: map[string]struct{}{
"z1": {},
},
expectedMaxUnavailableZones: 0,
expectedMaxError: 1,
},
}

for testName, testData := range tests {
Expand Down

0 comments on commit 7d0b1d3

Please sign in to comment.