From 0249305ecbed190c944e1a971f78f6096d107153 Mon Sep 17 00:00:00 2001 From: Yingrong Zhao <22300958+VinozzZ@users.noreply.github.com> Date: Mon, 16 Sep 2024 11:33:24 -0400 Subject: [PATCH 01/25] feat: extract key fields from rules config (#1327) ## Which problem is this PR solving? Extract sampling fields from config so that we can use it to construct key fields only spans fixes: #1325 ## Short description of the changes - add `GetSamplingFielder` interface to sampler config - add `GetKeyFields` to all samplers --- config/sampler_config.go | 103 +++++++++++++++++ sample/deterministic.go | 4 + sample/dynamic.go | 8 +- sample/dynamic_ema.go | 8 +- sample/ema_throughput.go | 8 +- sample/rules.go | 16 ++- sample/rules_test.go | 204 +++++++++++++++++++++------------- sample/sample.go | 1 + sample/totalthroughput.go | 8 +- sample/windowed_throughput.go | 7 +- 10 files changed, 280 insertions(+), 87 deletions(-) diff --git a/config/sampler_config.go b/config/sampler_config.go index 37917b2a32..dfe21902d6 100644 --- a/config/sampler_config.go +++ b/config/sampler_config.go @@ -149,10 +149,22 @@ type V2SamplerConfig struct { Samplers map[string]*V2SamplerChoice `json:"samplers" yaml:"Samplers,omitempty" validate:"required"` } +type GetSamplingFielder interface { + GetSamplingFields() []string +} + +var _ GetSamplingFielder = (*DeterministicSamplerConfig)(nil) + type DeterministicSamplerConfig struct { SampleRate int `json:"samplerate" yaml:"SampleRate,omitempty" default:"1" validate:"required,gte=1"` } +func (d *DeterministicSamplerConfig) GetSamplingFields() []string { + return nil +} + +var _ GetSamplingFielder = (*DynamicSamplerConfig)(nil) + type DynamicSamplerConfig struct { SampleRate int64 `json:"samplerate" yaml:"SampleRate,omitempty" validate:"required,gte=1"` ClearFrequency Duration `json:"clearfrequency" yaml:"ClearFrequency,omitempty"` @@ -161,6 +173,12 @@ type DynamicSamplerConfig struct { UseTraceLength bool `json:"usetracelength" yaml:"UseTraceLength,omitempty"` } +func (d *DynamicSamplerConfig) GetSamplingFields() []string { + return d.FieldList +} + +var _ GetSamplingFielder = (*EMADynamicSamplerConfig)(nil) + type EMADynamicSamplerConfig struct { GoalSampleRate int `json:"goalsamplerate" yaml:"GoalSampleRate,omitempty" validate:"gte=1"` AdjustmentInterval Duration `json:"adjustmentinterval" yaml:"AdjustmentInterval,omitempty"` @@ -173,6 +191,12 @@ type EMADynamicSamplerConfig struct { UseTraceLength bool `json:"usetracelength" yaml:"UseTraceLength,omitempty"` } +func (d *EMADynamicSamplerConfig) GetSamplingFields() []string { + return d.FieldList +} + +var _ GetSamplingFielder = (*EMAThroughputSamplerConfig)(nil) + type EMAThroughputSamplerConfig struct { GoalThroughputPerSec int `json:"goalthroughputpersec" yaml:"GoalThroughputPerSec,omitempty"` UseClusterSize bool `json:"useclustersize" yaml:"UseClusterSize,omitempty"` @@ -187,6 +211,12 @@ type EMAThroughputSamplerConfig struct { UseTraceLength bool `json:"usetracelength" yaml:"UseTraceLength,omitempty"` } +func (d *EMAThroughputSamplerConfig) GetSamplingFields() []string { + return d.FieldList +} + +var _ GetSamplingFielder = (*WindowedThroughputSamplerConfig)(nil) + type WindowedThroughputSamplerConfig struct { UpdateFrequency Duration `json:"updatefrequency" yaml:"UpdateFrequency,omitempty"` LookbackFrequency Duration `json:"lookbackfrequency" yaml:"LookbackFrequency,omitempty"` @@ -197,6 +227,12 @@ type WindowedThroughputSamplerConfig struct { UseTraceLength bool `json:"usetracelength" yaml:"UseTraceLength,omitempty"` } 
+func (d *WindowedThroughputSamplerConfig) GetSamplingFields() []string { + return d.FieldList +} + +var _ GetSamplingFielder = (*TotalThroughputSamplerConfig)(nil) + type TotalThroughputSamplerConfig struct { GoalThroughputPerSec int `json:"goalthroughputpersec" yaml:"GoalThroughputPerSec,omitempty" validate:"gte=1"` UseClusterSize bool `json:"useclustersize" yaml:"UseClusterSize,omitempty"` @@ -206,12 +242,50 @@ type TotalThroughputSamplerConfig struct { UseTraceLength bool `json:"usetracelength" yaml:"UseTraceLength,omitempty"` } +func (d *TotalThroughputSamplerConfig) GetSamplingFields() []string { + return d.FieldList +} + +var _ GetSamplingFielder = (*RulesBasedSamplerConfig)(nil) + type RulesBasedSamplerConfig struct { // Rules has deliberately different names for json and yaml for conversion from old to new format Rules []*RulesBasedSamplerRule `json:"rule" yaml:"Rules,omitempty"` CheckNestedFields bool `json:"checknestedfields" yaml:"CheckNestedFields,omitempty"` } +func (r *RulesBasedSamplerConfig) GetSamplingFields() []string { + fields := make(generics.Set[string], 0) + + for _, rule := range r.Rules { + if rule == nil { + continue + } + + for _, condition := range rule.Conditions { + // Field and Fields are mutually exclusive, so we only need to check one. + if condition.Fields != nil { + fields.Add(condition.Fields...) + continue + } + + if condition.Field != "" { + fields.Add(condition.Field) + } + } + + if rule.Sampler != nil { + fields.Add(rule.Sampler.GetSamplingFields()...) + } + } + + return fields.Members() +} + +var _ GetSamplingFielder = (*RulesBasedDownstreamSampler)(nil) + +// RulesBasedDownstreamSampler is a sampler that can be used as a downstream sampler in a rules-based sampler. +// Only one of the fields should be set. 
type RulesBasedDownstreamSampler struct { DynamicSampler *DynamicSamplerConfig `json:"dynamicsampler" yaml:"DynamicSampler,omitempty"` EMADynamicSampler *EMADynamicSamplerConfig `json:"emadynamicsampler" yaml:"EMADynamicSampler,omitempty"` @@ -221,6 +295,35 @@ type RulesBasedDownstreamSampler struct { DeterministicSampler *DeterministicSamplerConfig `json:"deterministicsampler" yaml:"DeterministicSampler,omitempty"` } +func (r *RulesBasedDownstreamSampler) GetSamplingFields() []string { + + if r.DeterministicSampler != nil { + return r.DeterministicSampler.GetSamplingFields() + } + + if r.DynamicSampler != nil { + return r.DynamicSampler.GetSamplingFields() + } + + if r.EMADynamicSampler != nil { + return r.EMADynamicSampler.GetSamplingFields() + } + + if r.EMAThroughputSampler != nil { + return r.EMAThroughputSampler.GetSamplingFields() + } + + if r.WindowedThroughputSampler != nil { + return r.WindowedThroughputSampler.GetSamplingFields() + } + + if r.TotalThroughputSampler != nil { + return r.TotalThroughputSampler.GetSamplingFields() + } + + return []string{} +} + type RulesBasedSamplerRule struct { // Conditions has deliberately different names for json and yaml for conversion from old to new format Name string `json:"name" yaml:"Name,omitempty"` diff --git a/sample/deterministic.go b/sample/deterministic.go index 1d2e5a44c4..c51500ddfe 100644 --- a/sample/deterministic.go +++ b/sample/deterministic.go @@ -57,3 +57,7 @@ func (d *DeterministicSampler) GetSampleRate(trace *types.Trace) (rate uint, kee return uint(d.sampleRate), shouldKeep, "deterministic/chance", "" } + +func (d *DeterministicSampler) GetKeyFields() []string { + return d.Config.GetSamplingFields() +} diff --git a/sample/dynamic.go b/sample/dynamic.go index 9637483281..9d39bb1c27 100644 --- a/sample/dynamic.go +++ b/sample/dynamic.go @@ -23,7 +23,8 @@ type DynamicSampler struct { prefix string lastMetrics map[string]int64 - key *traceKey + key *traceKey + keyFields []string dynsampler dynsampler.Sampler } @@ -42,6 +43,7 @@ func (d *DynamicSampler) Start() error { d.maxKeys = 500 } d.prefix = "dynamic_" + d.keyFields = d.Config.GetSamplingFields() // spin up the actual dynamic sampler d.dynsampler = &dynsampler.AvgSampleRate{ @@ -99,3 +101,7 @@ func (d *DynamicSampler) GetSampleRate(trace *types.Trace) (rate uint, keep bool } return rate, shouldKeep, "dynamic", key } + +func (d *DynamicSampler) GetKeyFields() []string { + return d.keyFields +} diff --git a/sample/dynamic_ema.go b/sample/dynamic_ema.go index c1b5c550b5..758ce23f55 100644 --- a/sample/dynamic_ema.go +++ b/sample/dynamic_ema.go @@ -27,7 +27,8 @@ type EMADynamicSampler struct { prefix string lastMetrics map[string]int64 - key *traceKey + key *traceKey + keyFields []string dynsampler dynsampler.Sampler } @@ -47,6 +48,7 @@ func (d *EMADynamicSampler) Start() error { d.maxKeys = 500 } d.prefix = "emadynamic_" + d.keyFields = d.Config.GetSamplingFields() // spin up the actual dynamic sampler d.dynsampler = &dynsampler.EMASampleRate{ @@ -107,3 +109,7 @@ func (d *EMADynamicSampler) GetSampleRate(trace *types.Trace) (rate uint, keep b } return rate, shouldKeep, "emadynamic", key } + +func (d *EMADynamicSampler) GetKeyFields() []string { + return d.keyFields +} diff --git a/sample/ema_throughput.go b/sample/ema_throughput.go index 58e5c2ad8a..e3b3bd7ce9 100644 --- a/sample/ema_throughput.go +++ b/sample/ema_throughput.go @@ -30,7 +30,8 @@ type EMAThroughputSampler struct { prefix string lastMetrics map[string]int64 - key *traceKey + key *traceKey + keyFields []string 
dynsampler *dynsampler.EMAThroughput } @@ -56,6 +57,7 @@ func (d *EMAThroughputSampler) Start() error { } d.prefix = "emathroughput_" + d.keyFields = d.Config.GetSamplingFields() // spin up the actual dynamic sampler d.dynsampler = &dynsampler.EMAThroughput{ GoalThroughputPerSec: d.goalThroughputPerSec / d.clusterSize, @@ -124,3 +126,7 @@ func (d *EMAThroughputSampler) GetSampleRate(trace *types.Trace) (rate uint, kee } return rate, shouldKeep, "emathroughput", key } + +func (d *EMAThroughputSampler) GetKeyFields() []string { + return d.keyFields +} diff --git a/sample/rules.go b/sample/rules.go index 3c48709f6f..98c2f5df56 100644 --- a/sample/rules.go +++ b/sample/rules.go @@ -15,11 +15,12 @@ import ( var _ ClusterSizer = (*RulesBasedSampler)(nil) type RulesBasedSampler struct { - Config *config.RulesBasedSamplerConfig - Logger logger.Logger - Metrics metrics.Metrics - samplers map[string]Sampler - prefix string + Config *config.RulesBasedSamplerConfig + Logger logger.Logger + Metrics metrics.Metrics + samplers map[string]Sampler + prefix string + keyFields []string } const RootPrefix = "root." @@ -35,6 +36,7 @@ func (s *RulesBasedSampler) Start() error { s.Metrics.Register(s.prefix+"sample_rate", "histogram") s.samplers = make(map[string]Sampler) + s.keyFields = s.Config.GetSamplingFields() for _, rule := range s.Config.Rules { for _, cond := range rule.Conditions { @@ -162,6 +164,10 @@ func (s *RulesBasedSampler) GetSampleRate(trace *types.Trace) (rate uint, keep b return 1, true, "no rule matched", "" } +func (s *RulesBasedSampler) GetKeyFields() []string { + return s.keyFields +} + func ruleMatchesTrace(t *types.Trace, rule *config.RulesBasedSamplerRule, checkNestedFields bool) bool { // We treat a rule with no conditions as a match. if rule.Conditions == nil { diff --git a/sample/rules_test.go b/sample/rules_test.go index d4ec68bd32..b9d70be913 100644 --- a/sample/rules_test.go +++ b/sample/rules_test.go @@ -2,6 +2,7 @@ package sample import ( "fmt" + "slices" "testing" "github.com/honeycombio/refinery/config" @@ -17,9 +18,10 @@ type TestRulesData struct { Spans []*types.Span // Set to the matching rule's sample rate if the rule matches. // Set to the default rate (1) if you expect no rule to match. 
- ExpectedRate uint - ExpectedKeep bool - ExpectedName string + ExpectedRate uint + ExpectedKeep bool + ExpectedName string + ExpectedKeyFields []string } func TestRules(t *testing.T) { @@ -49,8 +51,9 @@ func TestRules(t *testing.T) { }, }, }, - ExpectedKeep: true, - ExpectedRate: 10, + ExpectedKeep: true, + ExpectedRate: 10, + ExpectedKeyFields: []string{"test"}, }, { Rules: &config.RulesBasedSamplerConfig{ @@ -77,8 +80,9 @@ func TestRules(t *testing.T) { }, }, }, - ExpectedKeep: true, - ExpectedRate: 10, + ExpectedKeep: true, + ExpectedRate: 10, + ExpectedKeyFields: []string{"test"}, }, { Rules: &config.RulesBasedSamplerConfig{ @@ -105,8 +109,9 @@ func TestRules(t *testing.T) { }, }, }, - ExpectedKeep: true, - ExpectedRate: 10, + ExpectedKeep: true, + ExpectedRate: 10, + ExpectedKeyFields: []string{"test"}, }, { Rules: &config.RulesBasedSamplerConfig{ @@ -133,8 +138,9 @@ func TestRules(t *testing.T) { }, }, }, - ExpectedKeep: true, - ExpectedRate: 10, + ExpectedKeep: true, + ExpectedRate: 10, + ExpectedKeyFields: []string{"test"}, }, { Rules: &config.RulesBasedSamplerConfig{ @@ -165,9 +171,10 @@ func TestRules(t *testing.T) { }, }, }, - ExpectedKeep: true, - ExpectedRate: 10, - ExpectedName: "fallback", + ExpectedKeep: true, + ExpectedRate: 10, + ExpectedName: "fallback", + ExpectedKeyFields: []string{"test"}, }, { Rules: &config.RulesBasedSamplerConfig{ @@ -212,8 +219,9 @@ func TestRules(t *testing.T) { }, }, }, - ExpectedKeep: true, - ExpectedRate: 10, + ExpectedKeep: true, + ExpectedRate: 10, + ExpectedKeyFields: []string{"test", "test_two"}, }, { Rules: &config.RulesBasedSamplerConfig{ @@ -240,8 +248,9 @@ func TestRules(t *testing.T) { }, }, }, - ExpectedKeep: false, - ExpectedRate: 0, + ExpectedKeep: false, + ExpectedRate: 0, + ExpectedKeyFields: []string{"test"}, }, { Rules: &config.RulesBasedSamplerConfig{ @@ -261,8 +270,9 @@ func TestRules(t *testing.T) { }, }, }, - ExpectedKeep: false, - ExpectedRate: 0, + ExpectedKeep: false, + ExpectedRate: 0, + ExpectedKeyFields: []string{}, }, { Rules: &config.RulesBasedSamplerConfig{ @@ -303,8 +313,9 @@ func TestRules(t *testing.T) { }, ExpectedKeep: true, // the trace does not match all the rules so we expect the default sample rate - ExpectedRate: 1, - ExpectedName: "no rule matched", + ExpectedRate: 1, + ExpectedName: "no rule matched", + ExpectedKeyFields: []string{"first", "second"}, }, { Rules: &config.RulesBasedSamplerConfig{ @@ -331,8 +342,9 @@ func TestRules(t *testing.T) { }, }, }, - ExpectedKeep: true, - ExpectedRate: 4, + ExpectedKeep: true, + ExpectedRate: 4, + ExpectedKeyFields: []string{"first"}, }, { Rules: &config.RulesBasedSamplerConfig{ @@ -358,8 +370,9 @@ func TestRules(t *testing.T) { }, }, }, - ExpectedKeep: true, - ExpectedRate: 4, + ExpectedKeep: true, + ExpectedRate: 4, + ExpectedKeyFields: []string{"first"}, }, { Rules: &config.RulesBasedSamplerConfig{ @@ -385,8 +398,9 @@ func TestRules(t *testing.T) { }, }, }, - ExpectedKeep: true, - ExpectedRate: 4, + ExpectedKeep: true, + ExpectedRate: 4, + ExpectedKeyFields: []string{"first"}, }, { Rules: &config.RulesBasedSamplerConfig{ @@ -413,8 +427,9 @@ func TestRules(t *testing.T) { }, }, }, - ExpectedKeep: true, - ExpectedRate: 4, + ExpectedKeep: true, + ExpectedRate: 4, + ExpectedKeyFields: []string{"first"}, }, { Rules: &config.RulesBasedSamplerConfig{ @@ -441,8 +456,9 @@ func TestRules(t *testing.T) { }, }, }, - ExpectedKeep: true, - ExpectedRate: 4, + ExpectedKeep: true, + ExpectedRate: 4, + ExpectedKeyFields: []string{"first"}, }, { Rules: 
&config.RulesBasedSamplerConfig{ @@ -469,8 +485,9 @@ func TestRules(t *testing.T) { }, }, }, - ExpectedKeep: true, - ExpectedRate: 4, + ExpectedKeep: true, + ExpectedRate: 4, + ExpectedKeyFields: []string{"first"}, }, { Rules: &config.RulesBasedSamplerConfig{ @@ -497,8 +514,9 @@ func TestRules(t *testing.T) { }, }, }, - ExpectedKeep: true, - ExpectedRate: 10, + ExpectedKeep: true, + ExpectedRate: 10, + ExpectedKeyFields: []string{"test"}, }, { Rules: &config.RulesBasedSamplerConfig{ @@ -538,9 +556,10 @@ func TestRules(t *testing.T) { }, }, }, - ExpectedName: "Check root span for span count", - ExpectedKeep: true, - ExpectedRate: 1, + ExpectedName: "Check root span for span count", + ExpectedKeep: true, + ExpectedRate: 1, + ExpectedKeyFields: []string{"meta.span_count"}, }, { Rules: &config.RulesBasedSamplerConfig{ @@ -591,9 +610,10 @@ func TestRules(t *testing.T) { }, }, }, - ExpectedName: "Check root span for span count", - ExpectedKeep: false, - ExpectedRate: 0, + ExpectedName: "Check root span for span count", + ExpectedKeep: false, + ExpectedRate: 0, + ExpectedKeyFields: []string{"meta.span_count"}, }, { Rules: &config.RulesBasedSamplerConfig{ @@ -633,9 +653,10 @@ func TestRules(t *testing.T) { }, }, }, - ExpectedName: "Check that root span is missing", - ExpectedKeep: false, - ExpectedRate: 0, + ExpectedName: "Check that root span is missing", + ExpectedKeep: false, + ExpectedRate: 0, + ExpectedKeyFields: []string{}, }, { Rules: &config.RulesBasedSamplerConfig{ @@ -684,9 +705,10 @@ func TestRules(t *testing.T) { }, }, }, - ExpectedName: "Check that root span is present", - ExpectedKeep: true, - ExpectedRate: 99, + ExpectedName: "Check that root span is present", + ExpectedKeep: true, + ExpectedRate: 99, + ExpectedKeyFields: []string{}, }, { Rules: &config.RulesBasedSamplerConfig{ @@ -713,8 +735,9 @@ func TestRules(t *testing.T) { }, }, }, - ExpectedKeep: true, - ExpectedRate: 10, + ExpectedKeep: true, + ExpectedRate: 10, + ExpectedKeyFields: []string{"test", "test2"}, }, { Rules: &config.RulesBasedSamplerConfig{ @@ -741,9 +764,10 @@ func TestRules(t *testing.T) { }, }, }, - ExpectedKeep: true, - ExpectedName: "no rule matched", - ExpectedRate: 1, + ExpectedKeep: true, + ExpectedName: "no rule matched", + ExpectedRate: 1, + ExpectedKeyFields: []string{"test", "test2"}, }, { Rules: &config.RulesBasedSamplerConfig{ @@ -771,9 +795,10 @@ func TestRules(t *testing.T) { }, }, }, - ExpectedKeep: true, - ExpectedName: "no rule matched", - ExpectedRate: 1, + ExpectedKeep: true, + ExpectedName: "no rule matched", + ExpectedRate: 1, + ExpectedKeyFields: []string{"test", "test2"}, }, { Rules: &config.RulesBasedSamplerConfig{ @@ -832,9 +857,10 @@ func TestRules(t *testing.T) { }, }, }, - ExpectedName: "Check that the number of descendants is greater than 3", - ExpectedKeep: false, - ExpectedRate: 1, + ExpectedName: "Check that the number of descendants is greater than 3", + ExpectedKeep: false, + ExpectedRate: 1, + ExpectedKeyFields: []string{string(config.NUM_DESCENDANTS)}, }, { Rules: &config.RulesBasedSamplerConfig{ @@ -891,9 +917,10 @@ func TestRules(t *testing.T) { }, }, }, - ExpectedName: "no rule matched", - ExpectedKeep: true, - ExpectedRate: 1, + ExpectedName: "no rule matched", + ExpectedKeep: true, + ExpectedRate: 1, + ExpectedKeyFields: []string{string(config.NUM_DESCENDANTS)}, }, } @@ -919,6 +946,7 @@ func TestRules(t *testing.T) { } } + sampler.Start() rate, keep, reason, key := sampler.GetSampleRate(trace) assert.Equal(t, d.ExpectedRate, rate, d.Rules) @@ -929,6 +957,10 @@ func 
TestRules(t *testing.T) { assert.Contains(t, reason, name) assert.Equal(t, "", key) + keyFields := sampler.GetKeyFields() + slices.Sort(keyFields) + assert.Equal(t, d.ExpectedKeyFields, keyFields) + // we can only test when we don't expect to keep the trace if !d.ExpectedKeep { assert.Equal(t, d.ExpectedKeep, keep, d.Rules) @@ -966,8 +998,9 @@ func TestRulesWithNestedFields(t *testing.T) { }, }, }, - ExpectedKeep: true, - ExpectedRate: 10, + ExpectedKeep: true, + ExpectedRate: 10, + ExpectedKeyFields: []string{"test.test1"}, }, { Rules: &config.RulesBasedSamplerConfig{ @@ -995,8 +1028,9 @@ func TestRulesWithNestedFields(t *testing.T) { }, }, }, - ExpectedKeep: true, - ExpectedRate: 10, + ExpectedKeep: true, + ExpectedRate: 10, + ExpectedKeyFields: []string{"test.test1"}, }, { Rules: &config.RulesBasedSamplerConfig{ @@ -1025,8 +1059,9 @@ func TestRulesWithNestedFields(t *testing.T) { }, }, }, - ExpectedKeep: true, - ExpectedRate: 4, + ExpectedKeep: true, + ExpectedRate: 4, + ExpectedKeyFields: []string{"test.test1"}, }, { Rules: &config.RulesBasedSamplerConfig{ @@ -1055,9 +1090,10 @@ func TestRulesWithNestedFields(t *testing.T) { }, }, }, - ExpectedKeep: true, - ExpectedRate: 1, - ExpectedName: "no rule matched", + ExpectedKeep: true, + ExpectedRate: 1, + ExpectedName: "no rule matched", + ExpectedKeyFields: []string{"test.test1"}, }, { Rules: &config.RulesBasedSamplerConfig{ @@ -1087,8 +1123,9 @@ func TestRulesWithNestedFields(t *testing.T) { }, }, }, - ExpectedKeep: true, - ExpectedRate: 10, + ExpectedKeep: true, + ExpectedRate: 10, + ExpectedKeyFields: []string{"test.test1", "test.test2"}, }, } @@ -1112,6 +1149,7 @@ func TestRulesWithNestedFields(t *testing.T) { trace.AddSpan(span) } + sampler.Start() rate, keep, reason, key := sampler.GetSampleRate(trace) assert.Equal(t, d.ExpectedRate, rate, d.Rules) @@ -1122,6 +1160,9 @@ func TestRulesWithNestedFields(t *testing.T) { assert.Contains(t, reason, name) assert.Equal(t, "", key) + keyFields := sampler.GetKeyFields() + slices.Sort(keyFields) + assert.Equal(t, d.ExpectedKeyFields, keyFields) // we can only test when we don't expect to keep the trace if !d.ExpectedKeep { assert.Equal(t, d.ExpectedKeep, keep, d.Rules) @@ -1171,8 +1212,9 @@ func TestRulesWithDynamicSampler(t *testing.T) { }, }, }, - ExpectedKeep: true, - ExpectedRate: 10, + ExpectedKeep: true, + ExpectedRate: 10, + ExpectedKeyFields: []string{"http.status_code", "rule_test"}, }, } @@ -1206,6 +1248,10 @@ func TestRulesWithDynamicSampler(t *testing.T) { assert.Contains(t, reason, name) assert.Equal(t, "200•,", key) + keyFields := sampler.GetKeyFields() + slices.Sort(keyFields) + assert.Equal(t, d.ExpectedKeyFields, keyFields) + // we can only test when we don't expect to keep the trace if !d.ExpectedKeep { assert.Equal(t, d.ExpectedKeep, keep, d.Rules) @@ -1257,8 +1303,9 @@ func TestRulesWithEMADynamicSampler(t *testing.T) { }, }, }, - ExpectedKeep: true, - ExpectedRate: 10, + ExpectedKeep: true, + ExpectedRate: 10, + ExpectedKeyFields: []string{"http.status_code", "rule_test"}, }, } @@ -1292,6 +1339,9 @@ func TestRulesWithEMADynamicSampler(t *testing.T) { assert.Contains(t, reason, name) assert.Equal(t, "200•,", key) + keyFields := sampler.GetKeyFields() + slices.Sort(keyFields) + assert.Equal(t, d.ExpectedKeyFields, keyFields) // we can only test when we don't expect to keep the trace if !d.ExpectedKeep { assert.Equal(t, d.ExpectedKeep, keep, d.Rules) diff --git a/sample/sample.go b/sample/sample.go index b1b0f44490..0f4a1312e6 100644 --- a/sample/sample.go +++ 
b/sample/sample.go @@ -14,6 +14,7 @@ import ( type Sampler interface { GetSampleRate(trace *types.Trace) (rate uint, keep bool, reason string, key string) + GetKeyFields() []string Start() error } diff --git a/sample/totalthroughput.go b/sample/totalthroughput.go index e273c14413..9f47c7b41d 100644 --- a/sample/totalthroughput.go +++ b/sample/totalthroughput.go @@ -25,7 +25,8 @@ type TotalThroughputSampler struct { prefix string lastMetrics map[string]int64 - key *traceKey + key *traceKey + keyFields []string dynsampler *dynsampler.TotalThroughput } @@ -52,6 +53,7 @@ func (d *TotalThroughputSampler) Start() error { d.maxKeys = 500 } d.prefix = "totalthroughput_" + d.keyFields = d.Config.GetSamplingFields() // spin up the actual dynamic sampler d.dynsampler = &dynsampler.TotalThroughput{ @@ -116,3 +118,7 @@ func (d *TotalThroughputSampler) GetSampleRate(trace *types.Trace) (rate uint, k } return rate, shouldKeep, "totalthroughput", key } + +func (d *TotalThroughputSampler) GetKeyFields() []string { + return d.keyFields +} diff --git a/sample/windowed_throughput.go b/sample/windowed_throughput.go index d9eb30f15a..51220da8f1 100644 --- a/sample/windowed_throughput.go +++ b/sample/windowed_throughput.go @@ -26,7 +26,8 @@ type WindowedThroughputSampler struct { prefix string lastMetrics map[string]int64 - key *traceKey + key *traceKey + keyFields []string dynsampler *dynsampler.WindowedThroughput } @@ -47,6 +48,7 @@ func (d *WindowedThroughputSampler) Start() error { d.maxKeys = 500 } d.prefix = "windowedthroughput_" + d.keyFields = d.Config.GetSamplingFields() // spin up the actual dynamic sampler d.dynsampler = &dynsampler.WindowedThroughput{ @@ -112,3 +114,6 @@ func (d *WindowedThroughputSampler) GetSampleRate(trace *types.Trace) (rate uint } return rate, shouldKeep, "Windowedthroughput", key } +func (d *WindowedThroughputSampler) GetKeyFields() []string { + return d.keyFields +} From f8388c8940349eca0caebbeaace6818780b0fb6a Mon Sep 17 00:00:00 2001 From: Mike Goldsmith Date: Mon, 16 Sep 2024 17:06:28 +0100 Subject: [PATCH 02/25] fix: Use peer transmission during redistribute and shutdown events (#1332) ## Which problem is this PR solving? Updates the collector use the configured peer transmission to send spans to peers during redistribute and shutdown events. This ensures metrics for sending to Honeycomb and peers are accurate. The peer transmission is already configured and used by the router when receiving spans that should be sent to a peer. This is used the same transmission object after a span was originally accepted but then should be forwarded to a peer. 
- Closes #1324 ## Short description of the changes - Inject the already configured peer transmission during collector init - Update redistribute and shutdown logic to use the peer transmission - Update tests throughout to verify the peer transmission is used correctly and also verify it can be injected correctly - note: there were existing tests to verify redistribute and shutdown behaviour and have been updated --- collect/collect.go | 15 ++--- collect/collect_test.go | 131 ++++++++++++++++++++++++++-------------- 2 files changed, 95 insertions(+), 51 deletions(-) diff --git a/collect/collect.go b/collect/collect.go index 20002b8211..b88bbe18c1 100644 --- a/collect/collect.go +++ b/collect/collect.go @@ -67,11 +67,12 @@ type InMemCollector struct { Health health.Recorder `inject:""` Sharder sharder.Sharder `inject:""` - Transmission transmit.Transmission `inject:"upstreamTransmission"` - Metrics metrics.Metrics `inject:"genericMetrics"` - SamplerFactory *sample.SamplerFactory `inject:""` - StressRelief StressReliever `inject:"stressRelief"` - Peers peer.Peers `inject:""` + Transmission transmit.Transmission `inject:"upstreamTransmission"` + PeerTransmission transmit.Transmission `inject:"peerTransmission"` + Metrics metrics.Metrics `inject:"genericMetrics"` + SamplerFactory *sample.SamplerFactory `inject:""` + StressRelief StressReliever `inject:"stressRelief"` + Peers peer.Peers `inject:""` // For test use only BlockOnAddSpan bool @@ -433,7 +434,7 @@ func (i *InMemCollector) redistributeTraces() { sp.Data["meta.refinery.forwarded"] = i.hostname } - i.Transmission.EnqueueSpan(sp) + i.PeerTransmission.EnqueueSpan(sp) } forwardedTraces.Add(trace.TraceID) @@ -1033,7 +1034,7 @@ func (i *InMemCollector) sendSpansOnShutdown(ctx context.Context, sentSpanChan < sp.Data["meta.refinery.forwarded"] = i.hostname } - i.Transmission.EnqueueSpan(sp) + i.PeerTransmission.EnqueueSpan(sp) _, exist := forwardedTraces[sp.TraceID] if !exist { forwardedTraces[sp.TraceID] = struct{}{} diff --git a/collect/collect_test.go b/collect/collect_test.go index fac99fa6c0..68ee4020f9 100644 --- a/collect/collect_test.go +++ b/collect/collect_test.go @@ -40,7 +40,7 @@ func newCache() (cache.TraceSentCache, error) { return cache.NewCuckooSentCache(cfg, &metrics.NullMetrics{}) } -func newTestCollector(conf config.Config, transmission transmit.Transmission) *InMemCollector { +func newTestCollector(conf config.Config, transmission transmit.Transmission, peerTransmission transmit.Transmission) *InMemCollector { s := &metrics.MockMetrics{} s.Start() clock := clockwork.NewRealClock() @@ -50,14 +50,15 @@ func newTestCollector(conf config.Config, transmission transmit.Transmission) *I healthReporter.Start() return &InMemCollector{ - Config: conf, - Clock: clock, - Logger: &logger.NullLogger{}, - Tracer: noop.NewTracerProvider().Tracer("test"), - Health: healthReporter, - Transmission: transmission, - Metrics: &metrics.NullMetrics{}, - StressRelief: &MockStressReliever{}, + Config: conf, + Clock: clock, + Logger: &logger.NullLogger{}, + Tracer: noop.NewTracerProvider().Tracer("test"), + Health: healthReporter, + Transmission: transmission, + PeerTransmission: peerTransmission, + Metrics: &metrics.NullMetrics{}, + StressRelief: &MockStressReliever{}, SamplerFactory: &sample.SamplerFactory{ Config: conf, Metrics: s, @@ -94,7 +95,9 @@ func TestAddRootSpan(t *testing.T) { } transmission := &transmit.MockTransmission{} transmission.Start() - coll := newTestCollector(conf, transmission) + peerTransmission := &transmit.MockTransmission{} 
+ peerTransmission.Start() + coll := newTestCollector(conf, transmission, peerTransmission) c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{}) coll.cache = c @@ -176,7 +179,9 @@ func TestOriginalSampleRateIsNotedInMetaField(t *testing.T) { } transmission := &transmit.MockTransmission{} transmission.Start() - coll := newTestCollector(conf, transmission) + peerTransmission := &transmit.MockTransmission{} + peerTransmission.Start() + coll := newTestCollector(conf, transmission, peerTransmission) c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{}) coll.cache = c @@ -264,7 +269,9 @@ func TestTransmittedSpansShouldHaveASampleRateOfAtLeastOne(t *testing.T) { } transmission := &transmit.MockTransmission{} transmission.Start() - coll := newTestCollector(conf, transmission) + peerTransmission := &transmit.MockTransmission{} + peerTransmission.Start() + coll := newTestCollector(conf, transmission, peerTransmission) c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{}) coll.cache = c @@ -329,7 +336,9 @@ func TestAddSpan(t *testing.T) { } transmission := &transmit.MockTransmission{} transmission.Start() - coll := newTestCollector(conf, transmission) + peerTransmission := &transmit.MockTransmission{} + peerTransmission.Start() + coll := newTestCollector(conf, transmission, peerTransmission) c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{}) coll.cache = c @@ -399,7 +408,9 @@ func TestDryRunMode(t *testing.T) { } transmission := &transmit.MockTransmission{} transmission.Start() - coll := newTestCollector(conf, transmission) + peerTransmission := &transmit.MockTransmission{} + peerTransmission.Start() + coll := newTestCollector(conf, transmission, peerTransmission) samplerFactory := &sample.SamplerFactory{ Config: conf, @@ -538,7 +549,9 @@ func TestCacheSizeReload(t *testing.T) { transmission := &transmit.MockTransmission{} transmission.Start() - coll := newTestCollector(conf, transmission) + peerTransmission := &transmit.MockTransmission{} + peerTransmission.Start() + coll := newTestCollector(conf, transmission, peerTransmission) coll.Peers = &peer.MockPeers{} err := coll.Start() @@ -613,7 +626,9 @@ func TestSampleConfigReload(t *testing.T) { transmission := &transmit.MockTransmission{} transmission.Start() - coll := newTestCollector(conf, transmission) + peerTransmission := &transmit.MockTransmission{} + peerTransmission.Start() + coll := newTestCollector(conf, transmission, peerTransmission) err := coll.Start() assert.NoError(t, err) @@ -684,7 +699,9 @@ func TestStableMaxAlloc(t *testing.T) { transmission := &transmit.MockTransmission{} transmission.Start() - coll := newTestCollector(conf, transmission) + peerTransmission := &transmit.MockTransmission{} + peerTransmission.Start() + coll := newTestCollector(conf, transmission, peerTransmission) spandata := make([]map[string]interface{}, 500) for i := 0; i < 500; i++ { @@ -781,7 +798,9 @@ func TestAddSpanNoBlock(t *testing.T) { transmission := &transmit.MockTransmission{} transmission.Start() - coll := newTestCollector(conf, transmission) + peerTransmission := &transmit.MockTransmission{} + peerTransmission.Start() + coll := newTestCollector(conf, transmission, peerTransmission) c := cache.NewInMemCache(10, &metrics.NullMetrics{}, &logger.NullLogger{}) coll.cache = c @@ -826,6 +845,7 @@ func TestDependencyInjection(t *testing.T) { &inject.Object{Value: &health.Health{}}, &inject.Object{Value: &sharder.SingleServerSharder{}}, &inject.Object{Value: 
&transmit.MockTransmission{}, Name: "upstreamTransmission"}, + &inject.Object{Value: &transmit.MockTransmission{}, Name: "peerTransmission"}, &inject.Object{Value: &metrics.NullMetrics{}, Name: "genericMetrics"}, &inject.Object{Value: &sample.SamplerFactory{}}, &inject.Object{Value: &MockStressReliever{}, Name: "stressRelief"}, @@ -862,7 +882,9 @@ func TestAddCountsToRoot(t *testing.T) { transmission := &transmit.MockTransmission{} transmission.Start() - coll := newTestCollector(conf, transmission) + peerTransmission := &transmit.MockTransmission{} + peerTransmission.Start() + coll := newTestCollector(conf, transmission, peerTransmission) c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{}) coll.cache = c @@ -949,7 +971,9 @@ func TestLateRootGetsCounts(t *testing.T) { transmission := &transmit.MockTransmission{} transmission.Start() - coll := newTestCollector(conf, transmission) + peerTransmission := &transmit.MockTransmission{} + peerTransmission.Start() + coll := newTestCollector(conf, transmission, peerTransmission) c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{}) coll.cache = c @@ -1036,7 +1060,9 @@ func TestAddSpanCount(t *testing.T) { } transmission := &transmit.MockTransmission{} transmission.Start() - coll := newTestCollector(conf, transmission) + peerTransmission := &transmit.MockTransmission{} + peerTransmission.Start() + coll := newTestCollector(conf, transmission, peerTransmission) c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{}) coll.cache = c @@ -1107,7 +1133,9 @@ func TestLateRootGetsSpanCount(t *testing.T) { } transmission := &transmit.MockTransmission{} transmission.Start() - coll := newTestCollector(conf, transmission) + peerTransmission := &transmit.MockTransmission{} + peerTransmission.Start() + coll := newTestCollector(conf, transmission, peerTransmission) c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{}) coll.cache = c @@ -1182,7 +1210,9 @@ func TestLateSpanNotDecorated(t *testing.T) { transmission := &transmit.MockTransmission{} transmission.Start() - coll := newTestCollector(conf, transmission) + peerTransmission := &transmit.MockTransmission{} + peerTransmission.Start() + coll := newTestCollector(conf, transmission, peerTransmission) c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{}) coll.cache = c @@ -1249,7 +1279,9 @@ func TestAddAdditionalAttributes(t *testing.T) { } transmission := &transmit.MockTransmission{} transmission.Start() - coll := newTestCollector(conf, transmission) + peerTransmission := &transmit.MockTransmission{} + peerTransmission.Start() + coll := newTestCollector(conf, transmission, peerTransmission) c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{}) coll.cache = c @@ -1313,7 +1345,9 @@ func TestStressReliefSampleRate(t *testing.T) { transmission := &transmit.MockTransmission{} transmission.Start() - coll := newTestCollector(conf, transmission) + peerTransmission := &transmit.MockTransmission{} + peerTransmission.Start() + coll := newTestCollector(conf, transmission, peerTransmission) stc, err := newCache() assert.NoError(t, err, "lru cache should start") @@ -1400,7 +1434,9 @@ func TestStressReliefDecorateHostname(t *testing.T) { transmission := &transmit.MockTransmission{} transmission.Start() - coll := newTestCollector(conf, transmission) + peerTransmission := &transmit.MockTransmission{} + peerTransmission.Start() + coll := newTestCollector(conf, transmission, peerTransmission) coll.hostname = "host123" 
c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{}) @@ -1503,7 +1539,9 @@ func TestSpanWithRuleReasons(t *testing.T) { transmission := &transmit.MockTransmission{} transmission.Start() - coll := newTestCollector(conf, transmission) + peerTransmission := &transmit.MockTransmission{} + peerTransmission.Start() + coll := newTestCollector(conf, transmission, peerTransmission) c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{}) coll.cache = c @@ -1671,7 +1709,9 @@ func TestRedistributeTraces(t *testing.T) { transmission := &transmit.MockTransmission{} transmission.Start() - coll := newTestCollector(conf, transmission) + peerTransmission := &transmit.MockTransmission{} + peerTransmission.Start() + coll := newTestCollector(conf, transmission, peerTransmission) s := &sharder.MockSharder{ Self: &sharder.TestShard{Addr: "api1"}, } @@ -1702,7 +1742,7 @@ func TestRedistributeTraces(t *testing.T) { return len(transmission.Events) == 1 && transmission.Events[0].APIHost == "api1" }, conf.GetTracesConfig().GetTraceTimeout()*2, conf.GetTracesConfig().GetSendTickerValue()) - transmission.Flush() + peerTransmission.Flush() s.Other = &sharder.TestShard{Addr: "api2"} span = &types.Span{ @@ -1726,13 +1766,13 @@ func TestRedistributeTraces(t *testing.T) { coll.Peers.RegisterUpdatedPeersCallback(coll.redistributeTimer.Reset) assert.Eventually(t, func() bool { - transmission.Mux.Lock() - defer transmission.Mux.Unlock() - if len(transmission.Events) == 0 { + peerTransmission.Mux.Lock() + defer peerTransmission.Mux.Unlock() + if len(peerTransmission.Events) == 0 { return false } - return len(transmission.Events) == 1 && transmission.Events[0].APIHost == "api2" + return len(peerTransmission.Events) == 1 && peerTransmission.Events[0].APIHost == "api2" }, conf.GetTracesConfig().GetTraceTimeout()*2, conf.GetTracesConfig().GetSendTickerValue()) } @@ -1754,7 +1794,9 @@ func TestDrainTracesOnShutdown(t *testing.T) { } transmission := &transmit.MockTransmission{} transmission.Start() - coll := newTestCollector(conf, transmission) + peerTransmission := &transmit.MockTransmission{} + peerTransmission.Start() + coll := newTestCollector(conf, transmission, peerTransmission) coll.hostname = "host123" coll.Sharder = &sharder.MockSharder{ Self: &sharder.TestShard{Addr: "api1"}, @@ -1797,10 +1839,9 @@ func TestDrainTracesOnShutdown(t *testing.T) { go coll.sendSpansOnShutdown(ctx1, sentTraceChan, forwardTraceChan) require.EventuallyWithT(t, func(collect *assert.CollectT) { transmission.Mux.Lock() - events := transmission.Events - require.Len(collect, events, 1) - require.Equal(collect, span1.Dataset, events[0].Dataset) - transmission.Mux.Unlock() + defer transmission.Mux.Unlock() + require.Len(collect, transmission.Events, 1) + require.Equal(collect, span1.Dataset, transmission.Events[0].Dataset) }, 2*time.Second, 100*time.Millisecond) cancel1() @@ -1824,11 +1865,11 @@ func TestDrainTracesOnShutdown(t *testing.T) { ctx2, cancel2 := context.WithCancel(context.Background()) go coll.sendSpansOnShutdown(ctx2, sentTraceChan, forwardTraceChan) require.EventuallyWithT(t, func(collect *assert.CollectT) { - transmission.Mux.Lock() - require.Len(collect, transmission.Events, 1) - require.Equal(collect, span2.Dataset, transmission.Events[0].Dataset) - require.Equal(collect, "api2", transmission.Events[0].APIHost) - transmission.Mux.Unlock() + peerTransmission.Mux.Lock() + defer peerTransmission.Mux.Unlock() + require.Len(collect, peerTransmission.Events, 1) + require.Equal(collect, span2.Dataset, 
peerTransmission.Events[0].Dataset) + require.Equal(collect, "api2", peerTransmission.Events[0].APIHost) }, 2*time.Second, 100*time.Millisecond) cancel2() } @@ -1852,7 +1893,9 @@ func TestBigTracesGoEarly(t *testing.T) { transmission := &transmit.MockTransmission{} transmission.Start() - coll := newTestCollector(conf, transmission) + peerTransmission := &transmit.MockTransmission{} + peerTransmission.Start() + coll := newTestCollector(conf, transmission, peerTransmission) c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{}) coll.cache = c From f6589e1af9b4c29a88d6208c16b4f6d56ac36554 Mon Sep 17 00:00:00 2001 From: Mike Goldsmith Date: Mon, 16 Sep 2024 17:15:19 +0100 Subject: [PATCH 03/25] feat: Improve log messages to be more informative (#1322) ## Which problem is this PR solving? Improve some log messages to be more accurate about what's actually happening. - closes #1321 ## Short description of the changes - Improve log messages - Prefer using WithField / WithString to include parameters instead of inline formatting - Update memory overrun messages from error -> warn - they do not constitute data lost so error feels heavy handed --------- Co-authored-by: Kent Quirk --- cmd/refinery/main.go | 2 +- collect/collect.go | 13 +++++++------ collect/stressRelief.go | 2 +- metrics/otel_metrics.go | 10 +++++----- sharder/deterministic.go | 2 +- 5 files changed, 15 insertions(+), 14 deletions(-) diff --git a/cmd/refinery/main.go b/cmd/refinery/main.go index 24b45b3da6..d708c7915a 100644 --- a/cmd/refinery/main.go +++ b/cmd/refinery/main.go @@ -334,5 +334,5 @@ func main() { // unregister ourselves before we go close(done) time.Sleep(100 * time.Millisecond) - a.Logger.Error().Logf("Caught signal \"%s\"", sig) + a.Logger.Error().WithField("signal", sig).Logf("Caught OS signal") } diff --git a/collect/collect.go b/collect/collect.go index b88bbe18c1..4375c8b757 100644 --- a/collect/collect.go +++ b/collect/collect.go @@ -30,7 +30,7 @@ import ( "github.com/sirupsen/logrus" ) -var ErrWouldBlock = errors.New("not adding span, channel buffer is full") +var ErrWouldBlock = errors.New("Dropping span as channel buffer is full. Span will not be processed and will be lost.") var CollectorHealthKey = "collector" type Collector interface { @@ -265,13 +265,13 @@ func (i *InMemCollector) checkAlloc() { i.cache.RemoveTraces(tracesSent) // Treat any MaxAlloc overage as an error so we know it's happening - i.Logger.Error(). + i.Logger.Warn(). WithField("cache_size", cap). WithField("alloc", mem.Alloc). WithField("num_traces_sent", len(tracesSent)). WithField("datasize_sent", totalDataSizeSent). WithField("new_trace_count", i.cache.GetCacheCapacity()). - Logf("evicting large traces early due to memory overage") + Logf("Making some trace decisions early due to memory overrun.") // Manually GC here - without this we can easily end up evicting more than we // need to, since total alloc won't be updated until after a GC pass. @@ -799,7 +799,7 @@ func (i *InMemCollector) send(trace *types.Trace, sendReason string) { // if we're supposed to drop this trace, and dry run mode is not enabled, then we're done. 
if !shouldSend && !i.Config.GetIsDryRun() { i.Metrics.Increment("trace_send_dropped") - i.Logger.Info().WithFields(logFields).Logf("Dropping trace because of sampling") + i.Logger.Info().WithFields(logFields).Logf("Dropping trace because of sampling decision") return } i.Metrics.Increment("trace_send_kept") @@ -808,9 +808,10 @@ func (i *InMemCollector) send(trace *types.Trace, sendReason string) { // ok, we're not dropping this trace; send all the spans if i.Config.GetIsDryRun() && !shouldSend { - i.Logger.Info().WithFields(logFields).Logf("Trace would have been dropped, but dry run mode is enabled") + i.Logger.Info().WithFields(logFields).Logf("Trace would have been dropped, but sending because dry run mode is enabled") + } else { + i.Logger.Info().WithFields(logFields).Logf("Sending trace") } - i.Logger.Info().WithFields(logFields).Logf("Sending trace") for _, sp := range trace.GetSpans() { if i.Config.GetAddRuleReasonToTrace() { sp.Data["meta.refinery.reason"] = reason diff --git a/collect/stressRelief.go b/collect/stressRelief.go index 4e77caa265..893d43b7ba 100644 --- a/collect/stressRelief.go +++ b/collect/stressRelief.go @@ -255,7 +255,7 @@ func (s *StressRelief) UpdateFromConfig(cfg config.StressReliefConfig) { s.mode = Always default: // validation shouldn't let this happen but we'll be safe... s.mode = Never - s.Logger.Error().Logf("StressRelief mode is '%s' which shouldn't happen", cfg.Mode) + s.Logger.Error().WithString("mode", cfg.Mode).Logf("Invalid StressRelief mode") } s.Logger.Debug().WithField("mode", s.mode).Logf("setting StressRelief mode") diff --git a/metrics/otel_metrics.go b/metrics/otel_metrics.go index 1fc12899ae..7688e1ecf0 100644 --- a/metrics/otel_metrics.go +++ b/metrics/otel_metrics.go @@ -189,7 +189,7 @@ func (o *OTelMetrics) Register(name string, metricType string) { case "counter": ctr, err := o.meter.Int64Counter(name) if err != nil { - o.Logger.Error().WithString("msg", "failed to create counter").WithString("name", name) + o.Logger.Error().WithString("name", name).Logf("failed to create counter") return } o.counters[name] = ctr @@ -210,26 +210,26 @@ func (o *OTelMetrics) Register(name string, metricType string) { metric.WithFloat64Callback(f), ) if err != nil { - o.Logger.Error().WithString("msg", "failed to create gauge").WithString("name", name) + o.Logger.Error().WithString("name", name).Logf("failed to create gauge") return } o.gauges[name] = g case "histogram": h, err := o.meter.Float64Histogram(name) if err != nil { - o.Logger.Error().WithString("msg", "failed to create histogram").WithString("name", name) + o.Logger.Error().WithString("name", name).Logf("failed to create histogram") return } o.histograms[name] = h case "updown": ud, err := o.meter.Int64UpDownCounter(name) if err != nil { - o.Logger.Error().WithString("msg", "failed to create updown").WithString("name", name) + o.Logger.Error().WithString("name", name).Logf("failed to create updown counter") return } o.updowns[name] = ud default: - o.Logger.Error().WithString("msg", "unknown metric type").WithString("type", metricType) + o.Logger.Error().WithString("type", metricType).Logf("unknown metric type") return } } diff --git a/sharder/deterministic.go b/sharder/deterministic.go index a1ae132748..48912d153f 100644 --- a/sharder/deterministic.go +++ b/sharder/deterministic.go @@ -133,7 +133,7 @@ func (d *DeterministicSharder) Start() error { time.Sleep(5 * time.Second) } - d.Logger.Error().WithFields(map[string]interface{}{"peers": d.peers, "self": self}).Logf("list of current peers") + 
d.Logger.Error().WithFields(map[string]interface{}{"peers": d.peers, "self": self}).Logf("failed to find self in the peer list") return errors.New("failed to find self in the peer list") } From 19d24ee55bd0171fd636b15fed4499e1ffc3bf87 Mon Sep 17 00:00:00 2001 From: Yingrong Zhao <22300958+VinozzZ@users.noreply.github.com> Date: Mon, 16 Sep 2024 14:03:30 -0400 Subject: [PATCH 04/25] fix: remove unnecessary assertion to any (#1333) ## Which problem is this PR solving? - in #1329 , we introduced the assertion to `any` as the intermediate step to check whether a Sampler is a ClusterSizer. - It turns out that this intermediate step is unnecessary. ## Short description of the changes - remove assertion to `any` in `SetClusterSize` --- sample/rules.go | 5 +---- sample/sample.go | 8 ++++---- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/sample/rules.go b/sample/rules.go index 98c2f5df56..cd87f8f3a8 100644 --- a/sample/rules.go +++ b/sample/rules.go @@ -85,10 +85,7 @@ func (s *RulesBasedSampler) Start() error { func (s *RulesBasedSampler) SetClusterSize(size int) { for _, sampler := range s.samplers { - // Sampler does not implement ClusterSizer. - // By asserting Sampler to an empty interface, we will have access to the underlying pointer. - // We can then assert that pointer to the ClusterSizer. - if sampler, ok := sampler.(any).(ClusterSizer); ok { + if sampler, ok := sampler.(ClusterSizer); ok { sampler.SetClusterSize(size) } } diff --git a/sample/sample.go b/sample/sample.go index 0f4a1312e6..0e7d6e9c63 100644 --- a/sample/sample.go +++ b/sample/sample.go @@ -43,11 +43,11 @@ func (s *SamplerFactory) updatePeerCounts() { // all the samplers who want it should use the stored count for _, sampler := range s.samplers { - // Sampler does not implement ClusterSizer. - // By asserting Sampler to an empty interface, we will have access to the underlying pointer. - // We can then assert that pointer to the ClusterSizer. - if clusterSizer, ok := sampler.(any).(ClusterSizer); ok { + if clusterSizer, ok := sampler.(ClusterSizer); ok { + s.Logger.Warn().Logf("set cluster size to %d", s.peerCount) clusterSizer.SetClusterSize(s.peerCount) + } else { + s.Logger.Warn().Logf("sampler does not implement ClusterSizer") } } } From 4625f135e0d5c54ad1efb6c20929977d12b6445a Mon Sep 17 00:00:00 2001 From: Mike Goldsmith Date: Mon, 16 Sep 2024 21:01:40 +0100 Subject: [PATCH 05/25] feat: Update Honeycomb logger to use EMAThroughput sampler (#1328) ## Which problem is this PR solving? Updates the Honeycomb logger to use the [EMAThroughput](https://github.com/honeycombio/dynsampler-go/blob/main/emathroughput.go#L77) sampler. This sampler has built-in support for burst protection that the [PerKeyThroughput](https://github.com/honeycombio/dynsampler-go/blob/main/perkeythroughput.go#L17) sampler does not. Burst protection would be useful when a cluster node does down unexpectedly because the other nodes in the cluster will fail to make peer requests until it is removed from the cluster. This can result in a very high number of "failed to send" log messages in a very small window, faster than what the PerKeyThroughput sampler can adjust the sample rate which results in all log messages being sent. The burst protection from the EMAThroughput sampler would help here as it will schedule an update to sample rates if a high number of events are received very quickly, allowing it to react quicker. 
This is a like for like replacement, and such I haven't added any additional configuration options the new sampler supports. We can add more later if desired. ## Short description of the changes - Replace the Honeycomb logger sampler with the EMAThroughput sampler. --- logger/honeycomb.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/logger/honeycomb.go b/logger/honeycomb.go index fa77cd1217..5c0ab9954f 100644 --- a/logger/honeycomb.go +++ b/logger/honeycomb.go @@ -57,10 +57,10 @@ func (h *HoneycombLogger) Start() error { } if loggerConfig.GetSamplerEnabled() { - h.sampler = &dynsampler.PerKeyThroughput{ - ClearFrequencyDuration: 10 * time.Second, - PerKeyThroughputPerSec: loggerConfig.SamplerThroughput, - MaxKeys: 1000, + h.sampler = &dynsampler.EMAThroughput{ + AdjustmentInterval: 30 * time.Second, + GoalThroughputPerSec: loggerConfig.SamplerThroughput, + MaxKeys: 1000, } err := h.sampler.Start() if err != nil { From 5f8b2e639364fb8826eedca970c77976f2095a79 Mon Sep 17 00:00:00 2001 From: Yingrong Zhao <22300958+VinozzZ@users.noreply.github.com> Date: Fri, 20 Sep 2024 14:13:56 -0400 Subject: [PATCH 06/25] maint: rename sent_reason_cache to kept_reason_cache (#1346) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Which problem is this PR solving? In the `collector`, we use `sendReason` to describe why a trace is moved into the decision-making process. Additionally, we have `sentReason`, which indicates the reason behind a trace receiving a “kept” decision. To avoid confusion between these terms when sharing this information from the decider node to the rest of the cluster, I propose renaming `sentReasonCache` to `keptReasonCache` to better reflect its purpose. ## Short description of the changes - rename all `sentReasonCache` to `keptReasonCache` --- collect/cache/cuckooSentCache.go | 16 +++++++-------- ...ache_test.go => kept_reason_cache_test.go} | 20 +++++++++---------- ...reasons_cache.go => kept_reasons_cache.go} | 12 +++++------ collect/collect.go | 12 +++++------ types/event.go | 10 +++++----- 5 files changed, 35 insertions(+), 35 deletions(-) rename collect/cache/{sent_reason_cache_test.go => kept_reason_cache_test.go} (87%) rename collect/cache/{sent_reasons_cache.go => kept_reasons_cache.go} (79%) diff --git a/collect/cache/cuckooSentCache.go b/collect/cache/cuckooSentCache.go index 281a3178ba..df592c261c 100644 --- a/collect/cache/cuckooSentCache.go +++ b/collect/cache/cuckooSentCache.go @@ -38,8 +38,8 @@ type KeptTrace interface { SpanEventCount() uint32 SpanLinkCount() uint32 SpanCount() uint32 - SetSentReason(uint) - SentReason() uint + SetKeptReason(uint) + KeptReason() uint } func NewKeptTraceCacheEntry(t KeptTrace) *keptTraceCacheEntry { @@ -53,7 +53,7 @@ func NewKeptTraceCacheEntry(t KeptTrace) *keptTraceCacheEntry { spanEventCount: t.SpanEventCount(), spanLinkCount: t.SpanLinkCount(), spanCount: t.SpanCount(), - reason: uint32(t.SentReason()), + reason: uint32(t.KeptReason()), } } @@ -155,7 +155,7 @@ type cuckooSentCache struct { // This mutex is for managing kept traces keptMut sync.Mutex - sentReasons *SentReasonsCache + keptReasons *KeptReasonsCache } // Make sure it implements TraceSentCache @@ -188,7 +188,7 @@ func NewCuckooSentCache(cfg config.SampleCacheConfig, met metrics.Metrics) (Trac dropped: dropped, recentDroppedIDs: recentDroppedIDs, cfg: cfg, - sentReasons: NewSentReasonsCache(met), + keptReasons: NewKeptReasonsCache(met), done: make(chan struct{}), } go cache.monitor() @@ -220,7 
+220,7 @@ func (c *cuckooSentCache) Stop() { func (c *cuckooSentCache) Record(trace KeptTrace, keep bool, reason string) { if keep { // record this decision in the sent record LRU for future spans - trace.SetSentReason(c.sentReasons.Set(reason)) + trace.SetKeptReason(c.keptReasons.Set(reason)) sentRecord := NewKeptTraceCacheEntry(trace) c.keptMut.Lock() @@ -253,7 +253,7 @@ func (c *cuckooSentCache) CheckSpan(span *types.Span) (TraceSentRecord, string, if sentRecord, found := c.kept.Get(span.TraceID); found { // if we kept it, then this span being checked needs counting too sentRecord.Count(span) - reason, _ := c.sentReasons.Get(uint(sentRecord.reason)) + reason, _ := c.keptReasons.Get(uint(sentRecord.reason)) return sentRecord, reason, true } // we have no memory of this place @@ -306,7 +306,7 @@ func (c *cuckooSentCache) CheckTrace(traceID string) (TraceSentRecord, string, b c.keptMut.Lock() defer c.keptMut.Unlock() if sentRecord, found := c.kept.Get(traceID); found { - reason, _ := c.sentReasons.Get(uint(sentRecord.reason)) + reason, _ := c.keptReasons.Get(uint(sentRecord.reason)) return sentRecord, reason, true } // we have no memory of this place diff --git a/collect/cache/sent_reason_cache_test.go b/collect/cache/kept_reason_cache_test.go similarity index 87% rename from collect/cache/sent_reason_cache_test.go rename to collect/cache/kept_reason_cache_test.go index f46cc731f3..bfb10b4c1f 100644 --- a/collect/cache/sent_reason_cache_test.go +++ b/collect/cache/kept_reason_cache_test.go @@ -13,10 +13,10 @@ import ( "github.com/stretchr/testify/assert" ) -func TestSentReasonCache(t *testing.T) { +func TestKeptReasonCache(t *testing.T) { s := &metrics.MockMetrics{} s.Start() - c := cache.NewSentReasonsCache(s) + c := cache.NewKeptReasonsCache(s) keys := make([]uint, 0) entries := []string{"foo", "bar", "baz"} for _, item := range entries { @@ -29,7 +29,7 @@ func TestSentReasonCache(t *testing.T) { } } -func BenchmarkSentReasonCache_Set(b *testing.B) { +func BenchmarkKeptReasonCache_Set(b *testing.B) { s := &metrics.MockMetrics{} s.Start() for _, numItems := range []int{10, 100, 1000, 10000, 100000} { @@ -38,18 +38,18 @@ func BenchmarkSentReasonCache_Set(b *testing.B) { entries[i] = randomString(50) } b.Run(strconv.Itoa(numItems), func(b *testing.B) { - cache := cache.NewSentReasonsCache(s) + cache := cache.NewKeptReasonsCache(s) for i := 0; i < b.N; i++ { cache.Set(entries[seededRand.Intn(numItems)]) } }) } } -func BenchmarkSentReasonCache_Get(b *testing.B) { +func BenchmarkKeptReasonCache_Get(b *testing.B) { s := &metrics.MockMetrics{} s.Start() for _, numItems := range []int{10, 100, 1000, 10000, 100000} { - cache := cache.NewSentReasonsCache(s) + cache := cache.NewKeptReasonsCache(s) for i := 0; i < numItems; i++ { cache.Set(randomString(50)) } @@ -61,13 +61,13 @@ func BenchmarkSentReasonCache_Get(b *testing.B) { } } -func BenchmarkSentReasonsCache_Get_Parallel(b *testing.B) { +func BenchmarkKeptReasonsCache_Get_Parallel(b *testing.B) { for _, numGoroutines := range []int{1, 50, 300} { for _, numUniqueEntries := range []int{50, 500, 2000} { b.Run(fmt.Sprintf("entries%d-g%d", numUniqueEntries, numGoroutines), func(b *testing.B) { s := &metrics.MockMetrics{} s.Start() - cache := cache.NewSentReasonsCache(s) + cache := cache.NewKeptReasonsCache(s) entries := make([]string, numUniqueEntries) for i := 0; i < numUniqueEntries; i++ { @@ -96,7 +96,7 @@ func BenchmarkSentReasonsCache_Get_Parallel(b *testing.B) { } } -func BenchmarkSentReasonsCache_Set_Parallel(b *testing.B) { +func 
BenchmarkKeptReasonsCache_Set_Parallel(b *testing.B) { for _, numGoroutines := range []int{1, 50, 300} { for _, numUniqueEntries := range []int{50, 500, 2000} { b.Run(fmt.Sprintf("entries%d-g%d", numUniqueEntries, numGoroutines), func(b *testing.B) { @@ -106,7 +106,7 @@ func BenchmarkSentReasonsCache_Set_Parallel(b *testing.B) { for i := 0; i < numUniqueEntries; i++ { entries[i] = randomString(50) } - cache := cache.NewSentReasonsCache(s) + cache := cache.NewKeptReasonsCache(s) wg := sync.WaitGroup{} count := b.N / numGoroutines if count == 0 { diff --git a/collect/cache/sent_reasons_cache.go b/collect/cache/kept_reasons_cache.go similarity index 79% rename from collect/cache/sent_reasons_cache.go rename to collect/cache/kept_reasons_cache.go index 5b9e35bf6e..69469dfd7d 100644 --- a/collect/cache/sent_reasons_cache.go +++ b/collect/cache/kept_reasons_cache.go @@ -13,7 +13,7 @@ import ( // This is used to reduce the memory footprint of the trace cache. // It is not concurrency-safe. -type SentReasonsCache struct { +type KeptReasonsCache struct { Metrics metrics.Metrics data []string @@ -22,11 +22,11 @@ type SentReasonsCache struct { hashSeed uint64 } -// NewSentReasonsCache returns a new SentReasonsCache. -func NewSentReasonsCache(metrics metrics.Metrics) *SentReasonsCache { +// NewKeptReasonsCache returns a new SentReasonsCache. +func NewKeptReasonsCache(metrics metrics.Metrics) *KeptReasonsCache { metrics.Register("collect_sent_reasons_cache_entries", "histogram") - return &SentReasonsCache{ + return &KeptReasonsCache{ Metrics: metrics, keys: make(map[uint64]uint32), hashSeed: rand.Uint64(), @@ -35,7 +35,7 @@ func NewSentReasonsCache(metrics metrics.Metrics) *SentReasonsCache { // Set adds a new reason to the cache, returning the key. // The key is generated by incrementing a counter. -func (c *SentReasonsCache) Set(key string) uint { +func (c *KeptReasonsCache) Set(key string) uint { // generate a hash hash := wyhash.Hash([]byte(key), c.hashSeed) @@ -50,7 +50,7 @@ func (c *SentReasonsCache) Set(key string) uint { } // Get returns a reason from the cache, if it exists. -func (c *SentReasonsCache) Get(key uint) (string, bool) { +func (c *KeptReasonsCache) Get(key uint) (string, bool) { if key == 0 { return "", false } diff --git a/collect/collect.go b/collect/collect.go index 4375c8b757..fac8dad151 100644 --- a/collect/collect.go +++ b/collect/collect.go @@ -482,12 +482,12 @@ func (i *InMemCollector) processSpan(sp *types.Span) { trace := i.cache.Get(sp.TraceID) if trace == nil { // if the trace has already been sent, just pass along the span - if sr, sentReason, found := i.sampleTraceCache.CheckSpan(sp); found { + if sr, keptReason, found := i.sampleTraceCache.CheckSpan(sp); found { i.Metrics.Increment("trace_sent_cache_hit") // bump the count of records on this trace -- if the root span isn't // the last late span, then it won't be perfect, but it will be better than // having none at all - i.dealWithSentTrace(ctx, sr, sentReason, sp) + i.dealWithSentTrace(ctx, sr, keptReason, sp) return } // trace hasn't already been sent (or this span is really old); let's @@ -622,18 +622,18 @@ func (i *InMemCollector) ProcessSpanImmediately(sp *types.Span) (processed bool, // dealWithSentTrace handles a span that has arrived after the sampling decision // on the trace has already been made, and it obeys that decision by either // sending the span immediately or dropping it. 
-func (i *InMemCollector) dealWithSentTrace(ctx context.Context, tr cache.TraceSentRecord, sentReason string, sp *types.Span) { +func (i *InMemCollector) dealWithSentTrace(ctx context.Context, tr cache.TraceSentRecord, keptReason string, sp *types.Span) { _, span := otelutil.StartSpanMulti(ctx, i.Tracer, "dealWithSentTrace", map[string]interface{}{ "trace_id": sp.TraceID, - "sent_reason": sentReason, + "kept_reason": keptReason, "hostname": i.hostname, }) defer span.End() if i.Config.GetAddRuleReasonToTrace() { var metaReason string - if len(sentReason) > 0 { - metaReason = fmt.Sprintf("%s - late arriving span", sentReason) + if len(keptReason) > 0 { + metaReason = fmt.Sprintf("%s - late arriving span", keptReason) } else { metaReason = "late arriving span" } diff --git a/types/event.go b/types/event.go index 4a9c9f1979..c6fc76039b 100644 --- a/types/event.go +++ b/types/event.go @@ -47,7 +47,7 @@ type Trace struct { KeepSample bool // Sent should only be changed if the changer holds the SendSampleLock Sent bool - sentReason uint + keptReason uint SendBy time.Time @@ -114,12 +114,12 @@ func (t *Trace) SetSampleRate(rate uint) { t.sampleRate = rate } -func (t *Trace) SentReason() uint { - return t.sentReason +func (t *Trace) KeptReason() uint { + return t.keptReason } -func (t *Trace) SetSentReason(reason uint) { - t.sentReason = reason +func (t *Trace) SetKeptReason(reason uint) { + t.keptReason = reason } // DescendantCount gets the number of descendants of all kinds currently in this trace From 637a3f71063e98c8257867d1ddfd709c95a1c0f1 Mon Sep 17 00:00:00 2001 From: Kent Quirk Date: Mon, 23 Sep 2024 10:58:22 -0400 Subject: [PATCH 07/25] feat: Improve shutdown logic (#1347) ## Which problem is this PR solving? - If part of Refinery has a problem shutting down, there's no way to force it other than to do a `kill -9`, which is a pain. ## Short description of the changes - Call the Stop() routines in the shutdown rather than in a defer() - Instead of simply blocking on the signal channel, monitor that channel in a goroutine. If more than one shutdown signal is received, exit immediately without trying to be clean about it. - Also, if a shutdown signal is received and the health system is already "not alive", just die immediately; there's a good chance the system won't be able to do an orderly shutdown. --- cmd/refinery/main.go | 54 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 7 deletions(-) diff --git a/cmd/refinery/main.go b/cmd/refinery/main.go index d708c7915a..01f283c319 100644 --- a/cmd/refinery/main.go +++ b/cmd/refinery/main.go @@ -220,6 +220,8 @@ func main() { oTelMetrics = &metrics.OTelMetrics{} } + refineryHealth := &health.Health{} + resourceLib := "refinery" resourceVer := version tracer := trace.Tracer(noop.Tracer{}) @@ -260,7 +262,7 @@ func main() { {Value: version, Name: "version"}, {Value: samplerFactory}, {Value: stressRelief, Name: "stressRelief"}, - {Value: &health.Health{}}, + {Value: refineryHealth}, {Value: &configwatcher.ConfigWatcher{}}, {Value: &a}, } @@ -285,14 +287,11 @@ func main() { // the logger provided to startstop must be valid before any service is // started, meaning it can't rely on injected configs. make a custom logger - // just for this step + // just for this step (and for shutdown) ststLogger := logrus.New() // level, _ := logrus.ParseLevel(logLevel) ststLogger.SetLevel(logrus.DebugLevel) - // we can stop all the objects in one call, but we need to start the - // transmissions manually. 
- defer startstop.Stop(g.Objects(), ststLogger) if err := startstop.Start(g.Objects(), ststLogger); err != nil { fmt.Printf("failed to start injected dependencies. error: %+v\n", err) os.Exit(1) } @@ -325,14 +324,55 @@ func main() { metricsSingleton.Store("UPSTREAM_BUFFER_SIZE", float64(c.GetUpstreamBufferSize())) metricsSingleton.Store("PEER_BUFFER_SIZE", float64(c.GetPeerBufferSize())) - // set up signal channel to exit + // set up signal channel to exit, and allow a second try to kill everything + // immediately. sigsToExit := make(chan os.Signal, 1) + // this is the signal that the goroutine sends + exitWait := make(chan struct{}) + // this is the channel the goroutine uses to stop; it has to be unique since it's + // the last thing we do + monitorDone := make(chan struct{}) + // the signal gets sent to sigsToExit, which is monitored by the goroutine below signal.Notify(sigsToExit, syscall.SIGINT, syscall.SIGTERM) + go func() { + // first attempt does a normal close + forceExit := false + for { + select { + case <-sigsToExit: + // if some part of refinery is already dead, don't + // attempt an orderly shutdown, just die + if !refineryHealth.IsAlive() { + ststLogger.Logf(logrus.ErrorLevel, "at least one subsystem is not alive, exiting immediately") + os.Exit(1) + } + + // if this is true they've tried more than once, so get out + // without trying to be clean about it + if forceExit { + ststLogger.Logf(logrus.ErrorLevel, "immediate exit forced by second signal") + os.Exit(2) + } + forceExit = true + close(exitWait) + case <-monitorDone: + return + } + } + }() + // block on our signal handler to exit - sig := <-sigsToExit + sig := <-exitWait // unregister ourselves before we go close(done) time.Sleep(100 * time.Millisecond) a.Logger.Error().WithField("signal", sig).Logf("Caught OS signal") + + // these are the subsystems that might not shut down properly, so we're + // going to call this manually so that if something blocks on shutdown, you + // can still send a signal that will get heard. + startstop.Stop(g.Objects(), ststLogger) + close(monitorDone) + close(sigsToExit) }
From 8c3b91a1b9523412d84bd0045d7ce7a8ac8f9358 Mon Sep 17 00:00:00 2001
From: Yingrong Zhao <22300958+VinozzZ@users.noreply.github.com>
Date: Mon, 23 Sep 2024 12:53:34 -0400
Subject: [PATCH 08/25] fix: remove InMemoryCollector from liveness check on shutdown (#1349)

## Which problem is this PR solving?
related to #1348

Currently, the collector's health check timeout is set to 3 seconds. During a
shutdown, there could be a large number of traces or spans that need to be
forwarded to peers. If that forwarding takes longer than 3 seconds, it will
time out the liveness check. Since we are shutting down anyway, just let
ShutdownDelay do the work and not worry about liveness.
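In health-API terms, the change in `InMemCollector.Stop()` is to swap the "not ready" report for a full unregister. A minimal sketch of the idea (the `healthReporter` interface and `stopCollector` helper below are illustrative stand-ins, not the real `internal/health` types):

```go
package example

import "time"

// healthReporter captures the assumed shape of the health API used in this
// series (Register / Ready / Unregister); it is not the real health.Health type.
type healthReporter interface {
	Register(source string, timeout time.Duration)
	Ready(source string, ready bool)
	Unregister(source string)
}

const collectorHealthKey = "collector" // illustrative; the collector uses CollectorHealthKey

// stopCollector sketches the new shutdown behavior: instead of staying
// registered and reporting "not ready" (old call, commented out), the
// collector removes itself from liveness tracking entirely, so a slow
// peer-forwarding shutdown cannot fail the liveness check; ShutdownDelay
// bounds the shutdown instead.
func stopCollector(h healthReporter) {
	// h.Ready(collectorHealthKey, false) // old: still subject to the 3s liveness timeout
	h.Unregister(collectorHealthKey)
}
```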
## Short description of the changes - Unregister `collector` from Refinery's liveness check --- collect/collect.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/collect/collect.go b/collect/collect.go index fac8dad151..1d32e7db71 100644 --- a/collect/collect.go +++ b/collect/collect.go @@ -850,9 +850,10 @@ func (i *InMemCollector) send(trace *types.Trace, sendReason string) { func (i *InMemCollector) Stop() error { i.redistributeTimer.Stop() close(i.done) - // signal the health system to not be ready + // signal the health system to not be ready and + // stop liveness check // so that no new traces are accepted - i.Health.Ready(CollectorHealthKey, false) + i.Health.Unregister(CollectorHealthKey) i.mutex.Lock() From 7528846a17ef91cfc673399e9ca63991789039e4 Mon Sep 17 00:00:00 2001 From: Yingrong Zhao <22300958+VinozzZ@users.noreply.github.com> Date: Thu, 26 Sep 2024 10:07:44 -0400 Subject: [PATCH 09/25] fix: set 0 for otel metrics during registration (#1352) --- cmd/refinery/main.go | 5 +++++ metrics/otel_metrics.go | 10 ++++++++++ transmit/mock.go | 2 ++ transmit/transmit.go | 18 ++++++++++++------ 4 files changed, 29 insertions(+), 6 deletions(-) diff --git a/cmd/refinery/main.go b/cmd/refinery/main.go index 01f283c319..36620f7a88 100644 --- a/cmd/refinery/main.go +++ b/cmd/refinery/main.go @@ -316,11 +316,16 @@ func main() { "messages_sent": "counter", "response_decode_errors": "counter", } + for name, typ := range libhoneyMetricsName { upstreamMetricsRecorder.Register(name, typ) peerMetricsRecorder.Register(name, typ) } + // Register metrics after the metrics object has been created + peerTransmission.RegisterMetrics() + upstreamTransmission.RegisterMetrics() + metricsSingleton.Store("UPSTREAM_BUFFER_SIZE", float64(c.GetUpstreamBufferSize())) metricsSingleton.Store("PEER_BUFFER_SIZE", float64(c.GetPeerBufferSize())) diff --git a/metrics/otel_metrics.go b/metrics/otel_metrics.go index 7688e1ecf0..bf5b67fdaf 100644 --- a/metrics/otel_metrics.go +++ b/metrics/otel_metrics.go @@ -150,6 +150,7 @@ func (o *OTelMetrics) Start() error { if err != nil { return err } + o.gauges[name] = g name = "memory_inuse" @@ -185,6 +186,7 @@ func (o *OTelMetrics) Register(name string, metricType string) { o.lock.Lock() defer o.lock.Unlock() + ctx := context.Background() switch metricType { case "counter": ctr, err := o.meter.Int64Counter(name) @@ -192,6 +194,9 @@ func (o *OTelMetrics) Register(name string, metricType string) { o.Logger.Error().WithString("name", name).Logf("failed to create counter") return } + // Initialize the counter to 0 so that it will be reported + ctr.Add(ctx, 0) + o.counters[name] = ctr case "gauge": var f metric.Float64Callback = func(_ context.Context, result metric.Float64Observer) error { @@ -213,6 +218,7 @@ func (o *OTelMetrics) Register(name string, metricType string) { o.Logger.Error().WithString("name", name).Logf("failed to create gauge") return } + o.gauges[name] = g case "histogram": h, err := o.meter.Float64Histogram(name) @@ -220,6 +226,8 @@ func (o *OTelMetrics) Register(name string, metricType string) { o.Logger.Error().WithString("name", name).Logf("failed to create histogram") return } + + h.Record(ctx, 0) o.histograms[name] = h case "updown": ud, err := o.meter.Int64UpDownCounter(name) @@ -227,6 +235,8 @@ func (o *OTelMetrics) Register(name string, metricType string) { o.Logger.Error().WithString("name", name).Logf("failed to create updown counter") return } + + ud.Add(ctx, 0) o.updowns[name] = ud default: o.Logger.Error().WithString("type", 
metricType).Logf("unknown metric type") diff --git a/transmit/mock.go b/transmit/mock.go index c018131a96..66d2ac5019 100644 --- a/transmit/mock.go +++ b/transmit/mock.go @@ -31,3 +31,5 @@ func (m *MockTransmission) Flush() { defer m.Mux.Unlock() m.Events = m.Events[:0] } + +func (m *MockTransmission) RegisterMetrics() {} diff --git a/transmit/transmit.go b/transmit/transmit.go index 6a5790646e..3a3cad515b 100644 --- a/transmit/transmit.go +++ b/transmit/transmit.go @@ -20,6 +20,8 @@ type Transmission interface { EnqueueSpan(ev *types.Span) // Flush flushes the in-flight queue of all events and spans Flush() + + RegisterMetrics() } const ( @@ -65,12 +67,6 @@ func (d *DefaultTransmission) Start() error { libhoney.UserAgentAddition = "refinery/" + d.Version }) - d.Metrics.Register(counterEnqueueErrors, "counter") - d.Metrics.Register(counterResponse20x, "counter") - d.Metrics.Register(counterResponseErrors, "counter") - d.Metrics.Register(updownQueuedItems, "updown") - d.Metrics.Register(histogramQueueTime, "histogram") - processCtx, canceler := context.WithCancel(context.Background()) d.responseCanceler = canceler go d.processResponses(processCtx, d.LibhClient.TxResponses()) @@ -141,6 +137,16 @@ func (d *DefaultTransmission) Flush() { d.LibhClient.Flush() } +// RegisterMetrics registers the metrics used by the DefaultTransmission. +// it should be called after the metrics object has been created. +func (d *DefaultTransmission) RegisterMetrics() { + d.Metrics.Register(counterEnqueueErrors, "counter") + d.Metrics.Register(counterResponse20x, "counter") + d.Metrics.Register(counterResponseErrors, "counter") + d.Metrics.Register(updownQueuedItems, "updown") + d.Metrics.Register(histogramQueueTime, "histogram") +} + func (d *DefaultTransmission) Stop() error { // signal processResponses to stop if d.responseCanceler != nil { From 209cf9908262f177a72d63779481793e3112bb60 Mon Sep 17 00:00:00 2001 From: Yingrong Zhao <22300958+VinozzZ@users.noreply.github.com> Date: Thu, 26 Sep 2024 15:35:50 -0400 Subject: [PATCH 10/25] maint: Refactor metrics registration to streamline declaration and enable easier documentation generation (#1350) ## Which problem is this PR solving? - part 1 for: #1152 This PR prepares us to have a consistent metrics registration pattern so that in a later PR we can use https://pkg.go.dev/golang.org/x/tools/go/packages to automatically generate metrics documentation. 
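Concretely, the registration pattern every package moves toward looks roughly like the sketch below (made-up `example` package and metric names; the real declarations are in the per-package diffs that follow):

```go
package example

import "github.com/honeycombio/refinery/metrics"

type Example struct {
	Metrics metrics.Metrics `inject:""`
}

// Declare the package's metrics once, at package level, with enough metadata
// (type, unit, description) to generate documentation from later.
var exampleMetrics = []metrics.Metadata{
	{Name: "example_events_processed", Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "number of events processed"},
	{Name: "example_queue_length", Type: metrics.Gauge, Unit: metrics.Dimensionless, Description: "current length of the example queue"},
	{Name: "example_batch_latency", Type: metrics.Histogram, Unit: metrics.Milliseconds, Description: "time taken to send a batch"},
}

// Register the declared metrics in Start() instead of scattering
// Register(name, "type") string pairs throughout the code.
func (e *Example) Start() error {
	for _, m := range exampleMetrics {
		e.Metrics.Register(m)
	}
	return nil
}
```

Keeping the declarations in a package-level slice is what lets a later PR walk the packages with go/packages and emit metrics documentation statically.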
- part 2 is implemented in #1351 ## Short description of the changes - Introduce a new struct type `metrics.Metadata` to represent all information needed for a Refinery metric - Change `metrics.Register` signature to accept `metrics.Metadata` as its argument - Refactor metrics registration calls in each packages so that we declare metrics we want to register first in a package level variable first - Set zero value for each OTel metrics during registration - set Unit and Description for each OTel metrics --- app/app.go | 20 +++++++- cmd/refinery/main.go | 61 ++++++++++++++++++------ collect/cache/cache.go | 18 +++++--- collect/cache/cuckoo.go | 9 ++++ collect/cache/cuckooSentCache.go | 8 +++- collect/cache/kept_reasons_cache.go | 12 +++-- collect/collect.go | 72 +++++++++++++++-------------- collect/stressRelief.go | 14 ++++-- collect/stress_relief_test.go | 15 ++++-- internal/health/health.go | 8 ++++ internal/peer/file.go | 8 +++- internal/peer/pubsub_redis.go | 12 +++-- metrics/legacy.go | 26 ++++++----- metrics/legacy_test.go | 5 +- metrics/metrics.go | 46 +++++++++++++++++- metrics/metricsnamer.go | 7 ++- metrics/mock.go | 6 ++- metrics/multi_metrics.go | 8 ++-- metrics/multi_metrics_test.go | 15 ++++-- metrics/nullmetrics.go | 20 ++++---- metrics/otel_metrics.go | 63 +++++++++++++++---------- metrics/prometheus.go | 34 ++++++++------ metrics/prometheus_test.go | 22 +++++++-- pubsub/pubsub_goredis.go | 10 +++- pubsub/pubsub_local.go | 11 ++++- route/route.go | 16 ++++--- sample/deterministic.go | 11 +++-- sample/dynamic.go | 37 ++++----------- sample/dynamic_ema.go | 36 +++++---------- sample/ema_throughput.go | 35 ++++---------- sample/rules.go | 23 +++++---- sample/sample.go | 56 ++++++++++++++++++++-- sample/totalthroughput.go | 37 ++++----------- sample/windowed_throughput.go | 37 ++++----------- sample/windowed_throughput_test.go | 2 +- transmit/transmit.go | 16 +++++-- 36 files changed, 525 insertions(+), 311 deletions(-) diff --git a/app/app.go b/app/app.go index de1ee309a3..8469454b3b 100644 --- a/app/app.go +++ b/app/app.go @@ -42,8 +42,9 @@ func (a *App) Start() error { } a.Logger.Debug().Logf("Starting up App...") - a.Metrics.Register("config_hash", "gauge") - a.Metrics.Register("rule_config_hash", "gauge") + for _, metric := range configHashMetrics { + a.Metrics.Register(metric) + } a.IncomingRouter.SetVersion(a.Version) a.PeerRouter.SetVersion(a.Version) @@ -64,3 +65,18 @@ func (a *App) Stop() error { a.Logger.Debug().Logf("Shutting down App...") return nil } + +var configHashMetrics = []metrics.Metadata{ + metrics.Metadata{ + Name: "config_hash", + Type: metrics.Gauge, + Unit: metrics.Dimensionless, + Description: "The hash of the current configuration", + }, + metrics.Metadata{ + Name: "rule_config_hash", + Type: metrics.Gauge, + Unit: metrics.Dimensionless, + Description: "The hash of the current rules configuration", + }, +} diff --git a/cmd/refinery/main.go b/cmd/refinery/main.go index 36620f7a88..4dce9549f1 100644 --- a/cmd/refinery/main.go +++ b/cmd/refinery/main.go @@ -307,19 +307,9 @@ func main() { // these have to be done after the injection (of metrics) // these are the metrics that libhoney will emit; we preregister them so that they always appear - libhoneyMetricsName := map[string]string{ - "queue_length": "gauge", - "queue_overflow": "counter", - "send_errors": "counter", - "send_retries": "counter", - "batches_sent": "counter", - "messages_sent": "counter", - "response_decode_errors": "counter", - } - - for name, typ := range libhoneyMetricsName { - 
upstreamMetricsRecorder.Register(name, typ) - peerMetricsRecorder.Register(name, typ) + for _, metric := range libhoneyMetrics { + upstreamMetricsRecorder.Register(metric) + peerMetricsRecorder.Register(metric) } // Register metrics after the metrics object has been created @@ -381,3 +371,48 @@ func main() { close(monitorDone) close(sigsToExit) } + +var libhoneyMetrics = []metrics.Metadata{ + metrics.Metadata{ + Name: "queue_length", + Type: metrics.Gauge, + Unit: metrics.Dimensionless, + Description: "number of events waiting to be sent to destination", + }, + metrics.Metadata{ + Name: "queue_overflow", + Type: metrics.Counter, + Unit: metrics.Dimensionless, + Description: "number of events dropped due to queue overflow", + }, + metrics.Metadata{ + Name: "send_errors", + Type: metrics.Counter, + Unit: metrics.Dimensionless, + Description: "number of errors encountered while sending events to destination", + }, + metrics.Metadata{ + Name: "send_retries", + Type: metrics.Counter, + Unit: metrics.Dimensionless, + Description: "number of times a batch of events was retried", + }, + metrics.Metadata{ + Name: "batches_sent", + Type: metrics.Counter, + Unit: metrics.Dimensionless, + Description: "number of batches of events sent to destination", + }, + metrics.Metadata{ + Name: "messages_sent", + Type: metrics.Counter, + Unit: metrics.Dimensionless, + Description: "number of messages sent to destination", + }, + metrics.Metadata{ + Name: "response_decode_errors", + Type: metrics.Counter, + Unit: metrics.Dimensionless, + Description: "number of errors encountered while decoding responses from destination", + }, +} diff --git a/collect/cache/cache.go b/collect/cache/cache.go index 1c7c3ca12d..7c46b8106a 100644 --- a/collect/cache/cache.go +++ b/collect/cache/cache.go @@ -49,26 +49,30 @@ type DefaultInMemCache struct { const DefaultInMemCacheCapacity = 10000 +var collectCacheMetrics = []metrics.Metadata{ + {Name: "collect_cache_buffer_overrun", Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "The number of times the trace overwritten in the circular buffer has not yet been sent"}, + {Name: "collect_cache_capacity", Type: metrics.Gauge, Unit: metrics.Dimensionless, Description: "The number of traces that can be stored in the cache"}, + {Name: "collect_cache_entries", Type: metrics.Histogram, Unit: metrics.Dimensionless, Description: "The number of traces currently stored in the cache"}, +} + func NewInMemCache( capacity int, - metrics metrics.Metrics, + met metrics.Metrics, logger logger.Logger, ) *DefaultInMemCache { logger.Debug().Logf("Starting DefaultInMemCache") defer func() { logger.Debug().Logf("Finished starting DefaultInMemCache") }() - // buffer_overrun increments when the trace overwritten in the circular - // buffer has not yet been sent - metrics.Register("collect_cache_buffer_overrun", "counter") - metrics.Register("collect_cache_capacity", "gauge") - metrics.Register("collect_cache_entries", "histogram") + for _, metadata := range collectCacheMetrics { + met.Register(metadata) + } if capacity == 0 { capacity = DefaultInMemCacheCapacity } return &DefaultInMemCache{ - Metrics: metrics, + Metrics: met, Logger: logger, cache: make(map[string]*types.Trace, capacity), traceBuffer: make([]*types.Trace, capacity), diff --git a/collect/cache/cuckoo.go b/collect/cache/cuckoo.go index e6c4e4774d..5541161fa9 100644 --- a/collect/cache/cuckoo.go +++ b/collect/cache/cuckoo.go @@ -44,6 +44,12 @@ const ( AddQueueSleepTime = 100 * time.Microsecond ) +var cuckooTraceCheckerMetrics = 
[]metrics.Metadata{ + {Name: CurrentCapacity, Type: metrics.Gauge, Unit: metrics.Dimensionless, Description: "current capacity of the cuckoo filter"}, + {Name: FutureLoadFactor, Type: metrics.Gauge, Unit: metrics.Percent, Description: "the fraction of slots occupied in the future cuckoo filter"}, + {Name: CurrentLoadFactor, Type: metrics.Gauge, Unit: metrics.Percent, Description: "the fraction of slots occupied in the current cuckoo filter"}, +} + func NewCuckooTraceChecker(capacity uint, m metrics.Metrics) *CuckooTraceChecker { c := &CuckooTraceChecker{ capacity: capacity, @@ -52,6 +58,9 @@ func NewCuckooTraceChecker(capacity uint, m metrics.Metrics) *CuckooTraceChecker met: m, addch: make(chan string, AddQueueDepth), } + for _, metric := range cuckooTraceCheckerMetrics { + m.Register(metric) + } // To try to avoid blocking on Add, we have a goroutine that pulls from a // channel and adds to the filter. diff --git a/collect/cache/cuckooSentCache.go b/collect/cache/cuckooSentCache.go index df592c261c..e03ee8ba93 100644 --- a/collect/cache/cuckooSentCache.go +++ b/collect/cache/cuckooSentCache.go @@ -161,6 +161,10 @@ type cuckooSentCache struct { // Make sure it implements TraceSentCache var _ TraceSentCache = (*cuckooSentCache)(nil) +var cuckooSentCacheMetrics = []metrics.Metadata{ + {Name: "cache_recent_dropped_traces", Type: metrics.Gauge, Unit: metrics.Dimensionless, Description: "the current size of the most recent dropped trace cache"}, +} + func NewCuckooSentCache(cfg config.SampleCacheConfig, met metrics.Metrics) (TraceSentCache, error) { stc, err := lru.New[string, *keptTraceCacheEntry](int(cfg.KeptSize)) if err != nil { @@ -180,7 +184,9 @@ func NewCuckooSentCache(cfg config.SampleCacheConfig, met metrics.Metrics) (Trac // request. recentDroppedIDs := generics.NewSetWithTTL[string](3 * time.Second) - met.Register("cache_recent_dropped_traces", "gauge") + for _, metric := range cuckooSentCacheMetrics { + met.Register(metric) + } cache := &cuckooSentCache{ met: met, diff --git a/collect/cache/kept_reasons_cache.go b/collect/cache/kept_reasons_cache.go index 69469dfd7d..22fe39f882 100644 --- a/collect/cache/kept_reasons_cache.go +++ b/collect/cache/kept_reasons_cache.go @@ -22,12 +22,18 @@ type KeptReasonsCache struct { hashSeed uint64 } +var keptReasonCacheMetrics = []metrics.Metadata{ + {Name: "collect_sent_reasons_cache_entries", Type: metrics.Histogram, Unit: metrics.Dimensionless, Description: "Number of entries in the sent reasons cache"}, +} + // NewKeptReasonsCache returns a new SentReasonsCache. 
-func NewKeptReasonsCache(metrics metrics.Metrics) *KeptReasonsCache { - metrics.Register("collect_sent_reasons_cache_entries", "histogram") +func NewKeptReasonsCache(met metrics.Metrics) *KeptReasonsCache { + for _, metric := range keptReasonCacheMetrics { + met.Register(metric) + } return &KeptReasonsCache{ - Metrics: metrics, + Metrics: met, keys: make(map[uint64]uint32), hashSeed: rand.Uint64(), } diff --git a/collect/collect.go b/collect/collect.go index 1d32e7db71..58dd6ba6f0 100644 --- a/collect/collect.go +++ b/collect/collect.go @@ -95,6 +95,39 @@ type InMemCollector struct { hostname string } +var inMemCollectorMetrics = []metrics.Metadata{ + {Name: "trace_duration_ms", Type: metrics.Histogram, Unit: metrics.Milliseconds, Description: "time taken to process a trace from arrival to send"}, + {Name: "trace_span_count", Type: metrics.Histogram, Unit: metrics.Dimensionless, Description: "number of spans in a trace"}, + {Name: "collector_incoming_queue", Type: metrics.Histogram, Unit: metrics.Dimensionless, Description: "number of spans currently in the incoming queue"}, + {Name: "collector_peer_queue_length", Type: metrics.Gauge, Unit: metrics.Dimensionless, Description: "number of spans in the peer queue"}, + {Name: "collector_incoming_queue_length", Type: metrics.Gauge, Unit: metrics.Dimensionless, Description: "number of spans in the incoming queue"}, + {Name: "collector_peer_queue", Type: metrics.Histogram, Unit: metrics.Dimensionless, Description: "number of spans currently in the peer queue"}, + {Name: "collector_cache_size", Type: metrics.Gauge, Unit: metrics.Dimensionless, Description: "number of traces currently stored in the trace cache"}, + {Name: "memory_heap_allocation", Type: metrics.Gauge, Unit: metrics.Bytes, Description: "current heap allocation"}, + {Name: "span_received", Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "number of spans received by the collector"}, + {Name: "span_processed", Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "number of spans processed by the collector"}, + {Name: "spans_waiting", Type: metrics.UpDown, Unit: metrics.Dimensionless, Description: "number of spans waiting to be processed by the collector"}, + {Name: "trace_sent_cache_hit", Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "number of late spans received for traces that have already been sent"}, + {Name: "trace_accepted", Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "number of new traces received by the collector"}, + {Name: "trace_send_kept", Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "number of traces that has been kept"}, + {Name: "trace_send_dropped", Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "number of traces that has been dropped"}, + {Name: "trace_send_has_root", Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "number of kept traces that have a root span"}, + {Name: "trace_send_no_root", Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "number of kept traces that do not have a root span"}, + {Name: "trace_forwarded_on_peer_change", Type: metrics.Gauge, Unit: metrics.Dimensionless, Description: "number of traces forwarded due to peer membership change"}, + {Name: "trace_redistribution_count", Type: metrics.Gauge, Unit: metrics.Dimensionless, Description: "number of traces redistributed due to peer membership change"}, + {Name: "trace_send_on_shutdown", Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "number 
of traces sent during shutdown"}, + {Name: "trace_forwarded_on_shutdown", Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "number of traces forwarded during shutdown"}, + + {Name: TraceSendGotRoot, Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "number of traces that are ready for decision due to root span arrival"}, + {Name: TraceSendExpired, Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "number of traces that are ready for decision due to TraceTimeout or SendDelay"}, + {Name: TraceSendSpanLimit, Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "number of traces that are ready for decision due to span limit"}, + {Name: TraceSendEjectedFull, Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "number of traces that are ready for decision due to cache capacity overrun"}, + {Name: TraceSendEjectedMemsize, Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "number of traces that are ready for decision due to memory overrun"}, + {Name: TraceSendLateSpan, Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "number of spans that are sent due to late span arrival"}, + + {Name: "dropped_from_stress", Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "number of traces dropped due to stress relief"}, +} + func (i *InMemCollector) Start() error { i.Logger.Debug().Logf("Starting InMemCollector") defer func() { i.Logger.Debug().Logf("Finished starting InMemCollector") }() @@ -107,39 +140,11 @@ func (i *InMemCollector) Start() error { i.Health.Register(CollectorHealthKey, time.Duration(imcConfig.HealthCheckTimeout)) - i.Metrics.Register("trace_duration_ms", "histogram") - i.Metrics.Register("trace_span_count", "histogram") - i.Metrics.Register("collector_incoming_queue", "histogram") - i.Metrics.Register("collector_peer_queue_length", "gauge") - i.Metrics.Register("collector_incoming_queue_length", "gauge") - i.Metrics.Register("collector_peer_queue", "histogram") - i.Metrics.Register("collector_cache_size", "gauge") - i.Metrics.Register("memory_heap_allocation", "gauge") - i.Metrics.Register("span_received", "counter") - i.Metrics.Register("span_processed", "counter") - i.Metrics.Register("spans_waiting", "updown") - i.Metrics.Register("trace_sent_cache_hit", "counter") - i.Metrics.Register("trace_accepted", "counter") - i.Metrics.Register("trace_send_kept", "counter") - i.Metrics.Register("trace_send_dropped", "counter") - i.Metrics.Register("trace_send_has_root", "counter") - i.Metrics.Register("trace_send_no_root", "counter") - i.Metrics.Register("trace_forwarded_on_peer_change", "gauge") - i.Metrics.Register("trace_redistribution_count", "gauge") - i.Metrics.Register("trace_send_on_shutdown", "counter") - i.Metrics.Register("trace_forwarded_on_shutdown", "counter") - - i.Metrics.Register(TraceSendGotRoot, "counter") - i.Metrics.Register(TraceSendExpired, "counter") - i.Metrics.Register(TraceSendSpanLimit, "counter") - i.Metrics.Register(TraceSendEjectedFull, "counter") - i.Metrics.Register(TraceSendEjectedMemsize, "counter") - i.Metrics.Register(TraceSendLateSpan, "counter") + for _, metric := range inMemCollectorMetrics { + i.Metrics.Register(metric) + } sampleCacheConfig := i.Config.GetSampleCacheConfig() - i.Metrics.Register(cache.CurrentCapacity, "gauge") - i.Metrics.Register(cache.FutureLoadFactor, "gauge") - i.Metrics.Register(cache.CurrentLoadFactor, "gauge") var err error i.sampleTraceCache, err = cache.NewCuckooSentCache(sampleCacheConfig, i.Metrics) if err != nil { 
@@ -1063,7 +1068,7 @@ func (i *InMemCollector) addAdditionalAttributes(sp *types.Span) { } } -func newRedistributeNotifier(logger logger.Logger, metrics metrics.Metrics, clock clockwork.Clock) *redistributeNotifier { +func newRedistributeNotifier(logger logger.Logger, met metrics.Metrics, clock clockwork.Clock) *redistributeNotifier { r := &redistributeNotifier{ initialDelay: 3 * time.Second, maxDelay: 30 * time.Second, @@ -1071,11 +1076,10 @@ func newRedistributeNotifier(logger logger.Logger, metrics metrics.Metrics, cloc done: make(chan struct{}), clock: clock, logger: logger, - metrics: metrics, + metrics: met, triggered: make(chan struct{}), reset: make(chan struct{}), } - r.metrics.Register("trace_redistribution_count", "gauge") return r } diff --git a/collect/stressRelief.go b/collect/stressRelief.go index 893d43b7ba..83da176283 100644 --- a/collect/stressRelief.go +++ b/collect/stressRelief.go @@ -107,6 +107,13 @@ type StressRelief struct { const StressReliefHealthKey = "stress_relief" +var stressReliefMetrics = []metrics.Metadata{ + {Name: "cluster_stress_level", Type: metrics.Gauge, Unit: metrics.Dimensionless, Description: "The overall stress level of the cluster"}, + {Name: "individual_stress_level", Type: metrics.Gauge, Unit: metrics.Dimensionless, Description: "The stress level of the individual node"}, + {Name: "stress_level", Type: metrics.Gauge, Unit: metrics.Dimensionless, Description: "The stress level that's being used to determine whether to activate stress relief"}, + {Name: "stress_relief_activated", Type: metrics.Gauge, Unit: metrics.Dimensionless, Description: "Whether stress relief is currently activated"}, +} + func (s *StressRelief) Start() error { s.Logger.Debug().Logf("Starting StressRelief system") defer func() { s.Logger.Debug().Logf("Finished starting StressRelief system") }() @@ -115,10 +122,9 @@ func (s *StressRelief) Start() error { s.Health.Register(StressReliefHealthKey, 3*time.Second) // register stress level metrics - s.RefineryMetrics.Register("cluster_stress_level", "gauge") - s.RefineryMetrics.Register("individual_stress_level", "gauge") - s.RefineryMetrics.Register("stress_level", "gauge") - s.RefineryMetrics.Register("stress_relief_activated", "gauge") + for _, m := range stressReliefMetrics { + s.RefineryMetrics.Register(m) + } // We use an algorithms map so that we can name these algorithms, which makes it easier for several things: // - change our mind about which algorithm to use diff --git a/collect/stress_relief_test.go b/collect/stress_relief_test.go index 45b57a9958..6db1ee848b 100644 --- a/collect/stress_relief_test.go +++ b/collect/stress_relief_test.go @@ -27,7 +27,10 @@ func TestStressRelief_Monitor(t *testing.T) { defer stop() require.NoError(t, sr.Start()) - sr.RefineryMetrics.Register("collector_incoming_queue_length", "gauge") + sr.RefineryMetrics.Register(metrics.Metadata{ + Name: "collector_incoming_queue_length", + Type: metrics.Gauge, + }) sr.RefineryMetrics.Store("INCOMING_CAP", 1200) @@ -81,7 +84,10 @@ func TestStressRelief_Peer(t *testing.T) { defer stop() require.NoError(t, sr.Start()) - sr.RefineryMetrics.Register("collector_incoming_queue_length", "gauge") + sr.RefineryMetrics.Register(metrics.Metadata{ + Name: "collector_incoming_queue_length", + Type: metrics.Gauge, + }) sr.RefineryMetrics.Store("INCOMING_CAP", 1200) @@ -139,7 +145,10 @@ func TestStressRelief_OverallStressLevel(t *testing.T) { sr.disableStressLevelReport = true sr.Start() - sr.RefineryMetrics.Register("collector_incoming_queue_length", "gauge") + 
sr.RefineryMetrics.Register(metrics.Metadata{ + Name: "collector_incoming_queue_length", + Type: metrics.Gauge, + }) sr.RefineryMetrics.Store("INCOMING_CAP", 1200) diff --git a/internal/health/health.go b/internal/health/health.go index 96794388b5..e2405ba4ee 100644 --- a/internal/health/health.go +++ b/internal/health/health.go @@ -68,6 +68,11 @@ type Health struct { Reporter } +var healthMetrics = []metrics.Metadata{ + {Name: "is_ready", Type: metrics.Gauge, Unit: metrics.Dimensionless, Description: "Whether the system is ready to receive traffic"}, + {Name: "is_alive", Type: metrics.Gauge, Unit: metrics.Dimensionless, Description: "Whether the system is alive and reporting in"}, +} + func (h *Health) Start() error { // if we don't have a logger or metrics object, we'll use the null ones (makes testing easier) if h.Logger == nil { @@ -76,6 +81,9 @@ func (h *Health) Start() error { if h.Metrics == nil { h.Metrics = &metrics.NullMetrics{} } + for _, metric := range healthMetrics { + h.Metrics.Register(metric) + } h.timeouts = make(map[string]time.Duration) h.timeLeft = make(map[string]time.Duration) h.readies = make(map[string]bool) diff --git a/internal/peer/file.go b/internal/peer/file.go index b995183152..911658d9c5 100644 --- a/internal/peer/file.go +++ b/internal/peer/file.go @@ -45,8 +45,14 @@ func (p *FilePeers) RegisterUpdatedPeersCallback(callback func()) { callback() } +var filePeersMetrics = []metrics.Metadata{ + {Name: "num_file_peers", Type: metrics.Gauge, Unit: metrics.Dimensionless, Description: "Number of peers in the file peer list"}, +} + func (p *FilePeers) Start() (err error) { - p.Metrics.Register("num_file_peers", "gauge") + for _, metric := range filePeersMetrics { + p.Metrics.Register(metric) + } p.id, err = p.publicAddr() if err != nil { diff --git a/internal/peer/pubsub_redis.go b/internal/peer/pubsub_redis.go index 99d656b40b..5e00e31b78 100644 --- a/internal/peer/pubsub_redis.go +++ b/internal/peer/pubsub_redis.go @@ -120,6 +120,12 @@ func (p *RedisPubsubPeers) listen(ctx context.Context, msg string) { p.checkHash() } +var redisPubSubPeersMetrics = []metrics.Metadata{ + {Name: "num_peers", Type: metrics.Gauge, Unit: metrics.Dimensionless, Description: "the active number of peers in the cluster"}, + {Name: "peer_hash", Type: metrics.Gauge, Unit: metrics.Dimensionless, Description: "the hash of the current list of peers"}, + {Name: "peer_messages", Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "the number of messages received by the peers service"}, +} + func (p *RedisPubsubPeers) Start() error { if p.PubSub == nil { return errors.New("injected pubsub is nil") @@ -137,9 +143,9 @@ func (p *RedisPubsubPeers) Start() error { p.Logger.Info().Logf("subscribing to pubsub peers channel") p.sub = p.PubSub.Subscribe(context.Background(), "peers", p.listen) - p.Metrics.Register("num_peers", "gauge") - p.Metrics.Register("peer_hash", "gauge") - p.Metrics.Register("peer_messages", "counter") + for _, metric := range redisPubSubPeersMetrics { + p.Metrics.Register(metric) + } myaddr, err := p.publicAddr() if err != nil { diff --git a/metrics/legacy.go b/metrics/legacy.go index 8df829d355..e2bec8cbd2 100644 --- a/metrics/legacy.go +++ b/metrics/legacy.go @@ -17,6 +17,8 @@ import ( "github.com/honeycombio/refinery/logger" ) +var _ Metrics = (*LegacyMetrics)(nil) + type LegacyMetrics struct { Config config.Config `inject:""` Logger logger.Logger `inject:""` @@ -304,19 +306,19 @@ func average(vals []float64) float64 { return total / float64(len(vals)) } -func (h 
*LegacyMetrics) Register(name string, metricType string) { - h.Logger.Debug().Logf("metrics registering %s with name %s", metricType, name) - switch metricType { - case "counter": - getOrAdd(&h.lock, name, h.counters, createCounter) - case "gauge": - getOrAdd(&h.lock, name, h.gauges, createGauge) - case "histogram": - getOrAdd(&h.lock, name, h.histograms, createHistogram) - case "updown": - getOrAdd(&h.lock, name, h.updowns, createUpdown) +func (h *LegacyMetrics) Register(metadata Metadata) { + h.Logger.Debug().Logf("metrics registering %s with name %s", metadata.Type, metadata.Name) + switch metadata.Type { + case Counter: + getOrAdd(&h.lock, metadata.Name, h.counters, createCounter) + case Gauge: + getOrAdd(&h.lock, metadata.Name, h.gauges, createGauge) + case Histogram: + getOrAdd(&h.lock, metadata.Name, h.histograms, createHistogram) + case UpDown: + getOrAdd(&h.lock, metadata.Name, h.updowns, createUpdown) default: - h.Logger.Debug().Logf("unsupported metric type %s", metricType) + h.Logger.Debug().Logf("unsupported metric type %s", metadata.Type) } } diff --git a/metrics/legacy_test.go b/metrics/legacy_test.go index 2ccca10971..4ae6a47c3e 100644 --- a/metrics/legacy_test.go +++ b/metrics/legacy_test.go @@ -133,7 +133,10 @@ func TestMetricsUpdown(t *testing.T) { Logger: &logger.NullLogger{}, } m.Start() - m.Register("foo", "updown") + m.Register(Metadata{ + Name: "foo", + Type: UpDown, + }) m.Up("foo") m.Up("foo") m.Down("foo") diff --git a/metrics/metrics.go b/metrics/metrics.go index 32fbf83258..8d021bbab6 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -34,7 +34,7 @@ import ( // StressRelief. type Metrics interface { // Register declares a metric; metricType should be one of counter, gauge, histogram, updown - Register(name string, metricType string) + Register(metadata Metadata) Increment(name string) // for counters Gauge(name string, val interface{}) // for gauges Count(name string, n interface{}) // for counters @@ -89,3 +89,47 @@ func PrefixMetricName(prefix string, name string) string { } return name } + +type Metadata struct { + Name string + Type MetricType + // Unit is the unit of the metric. It should follow the UCUM case-sensitive + // unit format. + Unit Unit + // Description is a human-readable description of the metric + Description string +} + +type MetricType int + +func (m MetricType) String() string { + switch m { + case Counter: + return "counter" + case Gauge: + return "gauge" + case Histogram: + return "histogram" + case UpDown: + return "updown" + } + return "unknown" +} + +const ( + Counter MetricType = iota + Gauge + Histogram + UpDown +) + +type Unit string + +// Units defined by OpenTelemetry. +const ( + Dimensionless Unit = "1" + Bytes Unit = "By" + Milliseconds Unit = "ms" + Microseconds Unit = "us" + Percent Unit = "%" +) diff --git a/metrics/metricsnamer.go b/metrics/metricsnamer.go index 9cabf5c5dd..7bdb263cde 100644 --- a/metrics/metricsnamer.go +++ b/metrics/metricsnamer.go @@ -1,5 +1,7 @@ package metrics +var _ Metrics = (*MetricsPrefixer)(nil) + // This wraps a Metrics object and is a Metrics object itself, but adds a prefix // to all uses of its name. 
The point is that we can have a singleton Metrics // object that collects and reports all metrics rather than 3-5 different @@ -24,8 +26,9 @@ func (p *MetricsPrefixer) Start() error { return nil } -func (p *MetricsPrefixer) Register(name string, metricType string) { - p.Metrics.Register(p.prefix+name, metricType) +func (p *MetricsPrefixer) Register(metadata Metadata) { + metadata.Name = p.prefix + metadata.Name + p.Metrics.Register(metadata) } func (p *MetricsPrefixer) Increment(name string) { diff --git a/metrics/mock.go b/metrics/mock.go index 7d73306ed3..937fe44bb1 100644 --- a/metrics/mock.go +++ b/metrics/mock.go @@ -2,6 +2,8 @@ package metrics import "sync" +var _ Metrics = (*MockMetrics)(nil) + // MockMetrics collects metrics that were registered and changed to allow tests to // verify expected behavior type MockMetrics struct { @@ -25,11 +27,11 @@ func (m *MockMetrics) Start() { m.Constants = make(map[string]float64) } -func (m *MockMetrics) Register(name string, metricType string) { +func (m *MockMetrics) Register(metadata Metadata) { m.lock.Lock() defer m.lock.Unlock() - m.Registrations[name] = metricType + m.Registrations[metadata.Name] = metadata.Type.String() } func (m *MockMetrics) Increment(name string) { m.lock.Lock() diff --git a/metrics/multi_metrics.go b/metrics/multi_metrics.go index d773f3f805..3f94c46781 100644 --- a/metrics/multi_metrics.go +++ b/metrics/multi_metrics.go @@ -6,6 +6,8 @@ import ( "github.com/honeycombio/refinery/config" ) +var _ Metrics = (*MultiMetrics)(nil) + // MultiMetrics is a metrics provider that sends metrics to at least one // underlying metrics provider (StoreMetrics). It can be configured to send // metrics to multiple providers at once. @@ -61,13 +63,13 @@ func (m *MultiMetrics) Children() []Metrics { return m.children } -func (m *MultiMetrics) Register(name string, metricType string) { +func (m *MultiMetrics) Register(metadata Metadata) { for _, ch := range m.children { - ch.Register(name, metricType) + ch.Register(metadata) } m.lock.Lock() defer m.lock.Unlock() - m.values[name] = 0 + m.values[metadata.Name] = 0 } func (m *MultiMetrics) Increment(name string) { // for counters diff --git a/metrics/multi_metrics_test.go b/metrics/multi_metrics_test.go index bd3b2ca339..ef530445cc 100644 --- a/metrics/multi_metrics_test.go +++ b/metrics/multi_metrics_test.go @@ -71,9 +71,18 @@ func TestMultiMetrics_Register(t *testing.T) { // that are important to StressRelief. 
mm, err := getAndStartMultiMetrics() assert.NoError(t, err) - mm.Register("updown", "updowncounter") - mm.Register("counter", "counter") - mm.Register("gauge", "gauge") + mm.Register(Metadata{ + Name: "updown", + Type: UpDown, + }) + mm.Register(Metadata{ + Name: "counter", + Type: Counter, + }) + mm.Register(Metadata{ + Name: "gauge", + Type: Gauge, + }) mm.Count("counter", 1) mm.Up("updown") diff --git a/metrics/nullmetrics.go b/metrics/nullmetrics.go index 028742f5c6..411b2ac617 100644 --- a/metrics/nullmetrics.go +++ b/metrics/nullmetrics.go @@ -1,17 +1,19 @@ package metrics +var _ Metrics = (*NullMetrics)(nil) + // NullMetrics discards all metrics type NullMetrics struct{} // Start initializes all metrics or resets all metrics to zero func (n *NullMetrics) Start() {} -func (n *NullMetrics) Register(name string, metricType string) {} -func (n *NullMetrics) Increment(name string) {} -func (n *NullMetrics) Gauge(name string, val interface{}) {} -func (n *NullMetrics) Count(name string, val interface{}) {} -func (n *NullMetrics) Histogram(name string, obs interface{}) {} -func (n *NullMetrics) Up(name string) {} -func (n *NullMetrics) Down(name string) {} -func (n *NullMetrics) Store(name string, value float64) {} -func (n *NullMetrics) Get(name string) (float64, bool) { return 0, true } +func (n *NullMetrics) Register(metadata Metadata) {} +func (n *NullMetrics) Increment(name string) {} +func (n *NullMetrics) Gauge(name string, val interface{}) {} +func (n *NullMetrics) Count(name string, val interface{}) {} +func (n *NullMetrics) Histogram(name string, obs interface{}) {} +func (n *NullMetrics) Up(name string) {} +func (n *NullMetrics) Down(name string) {} +func (n *NullMetrics) Store(name string, value float64) {} +func (n *NullMetrics) Get(name string) (float64, bool) { return 0, true } diff --git a/metrics/otel_metrics.go b/metrics/otel_metrics.go index bf5b67fdaf..d97728de5f 100644 --- a/metrics/otel_metrics.go +++ b/metrics/otel_metrics.go @@ -18,6 +18,8 @@ import ( "go.opentelemetry.io/otel/sdk/resource" ) +var _ Metrics = (*OTelMetrics)(nil) + // OTelMetrics sends metrics to Honeycomb using the OpenTelemetry protocol. One // particular thing to note is that OTel metrics treats histograms very // differently than Honeycomb's Legacy metrics. In particular, Legacy metrics @@ -182,64 +184,77 @@ func (o *OTelMetrics) Start() error { return nil } -func (o *OTelMetrics) Register(name string, metricType string) { +// Register creates a new metric with the given metadata +// and initialize it with zero value. 
+func (o *OTelMetrics) Register(metadata Metadata) { o.lock.Lock() defer o.lock.Unlock() ctx := context.Background() - switch metricType { - case "counter": - ctr, err := o.meter.Int64Counter(name) + + unit := string(metadata.Unit) + switch metadata.Type { + case Counter: + ctr, err := o.meter.Int64Counter(metadata.Name, + metric.WithUnit(unit), + metric.WithDescription(metadata.Description), + ) if err != nil { - o.Logger.Error().WithString("name", name).Logf("failed to create counter") + o.Logger.Error().WithString("name", metadata.Name).Logf("failed to create counter") return } - // Initialize the counter to 0 so that it will be reported - ctr.Add(ctx, 0) - o.counters[name] = ctr - case "gauge": + // Give the counter an initial value of 0 so that OTel will send it + ctr.Add(ctx, 0) + o.counters[metadata.Name] = ctr + case Gauge: var f metric.Float64Callback = func(_ context.Context, result metric.Float64Observer) error { // this callback is invoked from outside this function call, so we // need to Rlock when we read the values map. We don't know how long // Observe() takes, so we make a copy of the value and unlock before // calling Observe. o.lock.RLock() - v := o.values[name] + v := o.values[metadata.Name] o.lock.RUnlock() result.Observe(v) return nil } - g, err := o.meter.Float64ObservableGauge(name, + g, err := o.meter.Float64ObservableGauge(metadata.Name, + metric.WithUnit(unit), + metric.WithDescription(metadata.Description), metric.WithFloat64Callback(f), ) if err != nil { - o.Logger.Error().WithString("name", name).Logf("failed to create gauge") + o.Logger.Error().WithString("name", metadata.Name).Logf("failed to create gauge") return } - o.gauges[name] = g - case "histogram": - h, err := o.meter.Float64Histogram(name) + o.gauges[metadata.Name] = g + case Histogram: + h, err := o.meter.Float64Histogram(metadata.Name, + metric.WithUnit(unit), + metric.WithDescription(metadata.Description), + ) if err != nil { - o.Logger.Error().WithString("name", name).Logf("failed to create histogram") + o.Logger.Error().WithString("name", metadata.Name).Logf("failed to create histogram") return } - h.Record(ctx, 0) - o.histograms[name] = h - case "updown": - ud, err := o.meter.Int64UpDownCounter(name) + o.histograms[metadata.Name] = h + case UpDown: + ud, err := o.meter.Int64UpDownCounter(metadata.Name, + metric.WithUnit(unit), + metric.WithDescription(metadata.Description), + ) if err != nil { - o.Logger.Error().WithString("name", name).Logf("failed to create updown counter") + o.Logger.Error().WithString("name", metadata.Name).Logf("failed to create updown counter") return } - ud.Add(ctx, 0) - o.updowns[name] = ud + o.updowns[metadata.Name] = ud default: - o.Logger.Error().WithString("type", metricType).Logf("unknown metric type") + o.Logger.Error().WithString("type", metadata.Type.String()).Logf("unknown metric type") return } } diff --git a/metrics/prometheus.go b/metrics/prometheus.go index 8b54a11d62..e0c25725bd 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -13,6 +13,8 @@ import ( "github.com/honeycombio/refinery/logger" ) +var _ Metrics = (*PromMetrics)(nil) + type PromMetrics struct { Config config.Config `inject:""` Logger logger.Logger `inject:""` @@ -45,40 +47,44 @@ func (p *PromMetrics) Start() error { // Register takes a name and a metric type. 
The type should be one of "counter", // "gauge", or "histogram" -func (p *PromMetrics) Register(name string, metricType string) { +func (p *PromMetrics) Register(metadata Metadata) { p.lock.Lock() defer p.lock.Unlock() - newmet, exists := p.metrics[name] + newmet, exists := p.metrics[metadata.Name] // don't attempt to add the metric again as this will cause a panic if exists { return } - switch metricType { - case "counter": + help := metadata.Description + if help == "" { + help = metadata.Name + } + switch metadata.Type { + case Counter: newmet = promauto.NewCounter(prometheus.CounterOpts{ - Name: name, - Help: name, + Name: metadata.Name, + Help: help, }) - case "gauge", "updown": // updown is a special gauge + case Gauge, UpDown: // updown is a special gauge newmet = promauto.NewGauge(prometheus.GaugeOpts{ - Name: name, - Help: name, + Name: metadata.Name, + Help: help, }) - case "histogram": + case Histogram: newmet = promauto.NewHistogram(prometheus.HistogramOpts{ - Name: name, - Help: name, + Name: metadata.Name, + Help: help, // This is an attempt at a usable set of buckets for a wide range of metrics // 16 buckets, first upper bound of 1, each following upper bound is 4x the previous Buckets: prometheus.ExponentialBuckets(1, 4, 16), }) } - p.metrics[name] = newmet - p.values[name] = 0 + p.metrics[metadata.Name] = newmet + p.values[metadata.Name] = 0 } func (p *PromMetrics) Get(name string) (float64, bool) { diff --git a/metrics/prometheus_test.go b/metrics/prometheus_test.go index 6ff27073e7..eb889e61c0 100644 --- a/metrics/prometheus_test.go +++ b/metrics/prometheus_test.go @@ -19,9 +19,15 @@ func TestMultipleRegistrations(t *testing.T) { assert.NoError(t, err) - p.Register("test", "counter") - - p.Register("test", "counter") + p.Register(Metadata{ + Name: "test", + Type: Counter, + }) + + p.Register(Metadata{ + Name: "test", + Type: Counter, + }) } func TestRaciness(t *testing.T) { @@ -34,14 +40,20 @@ func TestRaciness(t *testing.T) { assert.NoError(t, err) - p.Register("race", "counter") + p.Register(Metadata{ + Name: "race", + Type: Counter, + }) // this loop modifying the metric registry and reading it to increment // a counter should not trigger a race condition for i := 0; i < 50; i++ { go func(j int) { metricName := fmt.Sprintf("metric%d", j) - p.Register(metricName, "counter") + p.Register(Metadata{ + Name: metricName, + Type: Counter, + }) }(i) go func(j int) { diff --git a/pubsub/pubsub_goredis.go b/pubsub/pubsub_goredis.go index 33a3a471e0..80d27e43e1 100644 --- a/pubsub/pubsub_goredis.go +++ b/pubsub/pubsub_goredis.go @@ -48,6 +48,11 @@ type GoRedisSubscription struct { // Ensure that GoRedisSubscription implements Subscription var _ Subscription = (*GoRedisSubscription)(nil) +var goredisPubSubMetrics = []metrics.Metadata{ + {Name: "redis_pubsub_published", Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "Number of messages published to Redis PubSub"}, + {Name: "redis_pubsub_received", Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "Number of messages received from Redis PubSub"}, +} + func (ps *GoRedisPubSub) Start() error { options := new(redis.UniversalOptions) var ( @@ -100,8 +105,9 @@ func (ps *GoRedisPubSub) Start() error { } } - ps.Metrics.Register("redis_pubsub_published", "counter") - ps.Metrics.Register("redis_pubsub_received", "counter") + for _, metric := range goredisPubSubMetrics { + ps.Metrics.Register(metric) + } ps.client = client ps.subs = make([]*GoRedisSubscription, 0) diff --git a/pubsub/pubsub_local.go 
b/pubsub/pubsub_local.go index ec9012aabf..28669facf2 100644 --- a/pubsub/pubsub_local.go +++ b/pubsub/pubsub_local.go @@ -31,14 +31,21 @@ type LocalSubscription struct { // Ensure that LocalSubscription implements Subscription var _ Subscription = (*LocalSubscription)(nil) +var localPubSubMetrics = []metrics.Metadata{ + {Name: "local_pubsub_published", Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "The total number of messages sent via the local pubsub implementation"}, + {Name: "local_pubsub_received", Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "The total number of messages received via the local pubsub implementation"}, +} + // Start initializes the LocalPubSub func (ps *LocalPubSub) Start() error { ps.topics = make(map[string][]*LocalSubscription) if ps.Metrics == nil { ps.Metrics = &metrics.NullMetrics{} } - ps.Metrics.Register("local_pubsub_published", "counter") - ps.Metrics.Register("local_pubsub_received", "counter") + + for _, metric := range localPubSubMetrics { + ps.Metrics.Register(metric) + } return nil } diff --git a/route/route.go b/route/route.go index 9d083dacd8..fe4fba1207 100644 --- a/route/route.go +++ b/route/route.go @@ -121,6 +121,11 @@ func (r *Router) SetVersion(ver string) { r.versionStr = ver } +var routerMetrics = []metrics.Metadata{ + {Name: "_router_proxied", Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "the number of events proxied to another refinery"}, + {Name: "_router_event", Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "the number of events received"}, +} + // LnS spins up the Listen and Serve portion of the router. A router is // initialized as being for either incoming traffic from clients or traffic from // a peer. They listen on different addresses so peer traffic can be @@ -145,13 +150,10 @@ func (r *Router) LnS(incomingOrPeer string) { return } - r.Metrics.Register(r.incomingOrPeer+"_router_proxied", "counter") - r.Metrics.Register(r.incomingOrPeer+"_router_event", "counter") - r.Metrics.Register(r.incomingOrPeer+"_router_batch", "counter") - r.Metrics.Register(r.incomingOrPeer+"_router_nonspan", "counter") - r.Metrics.Register(r.incomingOrPeer+"_router_span", "counter") - r.Metrics.Register(r.incomingOrPeer+"_router_peer", "counter") - r.Metrics.Register(r.incomingOrPeer+"_router_dropped", "counter") + for _, metric := range routerMetrics { + metric.Name = r.incomingOrPeer + metric.Name + r.Metrics.Register(metric) + } muxxer := mux.NewRouter() diff --git a/sample/deterministic.go b/sample/deterministic.go index c51500ddfe..4a52d611d1 100644 --- a/sample/deterministic.go +++ b/sample/deterministic.go @@ -29,11 +29,16 @@ func (d *DeterministicSampler) Start() error { d.Logger.Debug().Logf("Starting DeterministicSampler") defer func() { d.Logger.Debug().Logf("Finished starting DeterministicSampler") }() d.sampleRate = d.Config.SampleRate - d.prefix = "deterministic_" + d.prefix = "deterministic" if d.Metrics == nil { d.Metrics = &metrics.NullMetrics{} } + for _, metric := range samplerMetrics { + metric.Name = d.prefix + metric.Name + d.Metrics.Register(metric) + } + // Get the actual upper bound - the largest possible value divided by // the sample rate. In the case where the sample rate is 1, this should // sample every value. 
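// Worked example of the bound described above (assuming upperBound is
// math.MaxUint32 / SampleRate and that the first four bytes of the trace ID
// hash are read as a big-endian uint32, as in the hunk below):
//
//	SampleRate = 1  -> upperBound = 0xFFFFFFFF -> every trace is kept
//	SampleRate = 4  -> upperBound = 0x3FFFFFFF -> roughly 1 in 4 trace IDs kept
//	SampleRate = 10 -> upperBound = 0x19999999 -> roughly 1 in 10 trace IDs kept
//
// Because the decision depends only on the hash of the trace ID, every
// Refinery node reaches the same keep/drop decision for a given trace.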
@@ -50,9 +55,9 @@ func (d *DeterministicSampler) GetSampleRate(trace *types.Trace) (rate uint, kee v := binary.BigEndian.Uint32(sum[:4]) shouldKeep := v <= d.upperBound if shouldKeep { - d.Metrics.Increment(d.prefix + "num_kept") + d.Metrics.Increment(d.prefix + "_num_kept") } else { - d.Metrics.Increment(d.prefix + "num_dropped") + d.Metrics.Increment(d.prefix + "_num_dropped") } return uint(d.sampleRate), shouldKeep, "deterministic/chance", "" diff --git a/sample/dynamic.go b/sample/dynamic.go index 9d39bb1c27..92987e650c 100644 --- a/sample/dynamic.go +++ b/sample/dynamic.go @@ -21,12 +21,12 @@ type DynamicSampler struct { clearFrequency config.Duration maxKeys int prefix string - lastMetrics map[string]int64 key *traceKey keyFields []string - dynsampler dynsampler.Sampler + dynsampler dynsampler.Sampler + metricsRecorder dynsamplerMetricsRecorder } func (d *DynamicSampler) Start() error { @@ -42,9 +42,9 @@ func (d *DynamicSampler) Start() error { if d.maxKeys == 0 { d.maxKeys = 500 } - d.prefix = "dynamic_" d.keyFields = d.Config.GetSamplingFields() + d.prefix = "dynamic" // spin up the actual dynamic sampler d.dynsampler = &dynsampler.AvgSampleRate{ GoalSampleRate: int(d.sampleRate), @@ -53,14 +53,12 @@ func (d *DynamicSampler) Start() error { } d.dynsampler.Start() - // Register statistics this package will produce - d.lastMetrics = d.dynsampler.GetMetrics(d.prefix) - for name := range d.lastMetrics { - d.Metrics.Register(name, getMetricType(name)) + // Register statistics from the dynsampler-go package + d.metricsRecorder = dynsamplerMetricsRecorder{ + met: d.Metrics, + prefix: d.prefix, } - d.Metrics.Register(d.prefix+"num_dropped", "counter") - d.Metrics.Register(d.prefix+"num_kept", "counter") - d.Metrics.Register(d.prefix+"sample_rate", "histogram") + d.metricsRecorder.RegisterMetrics(d.dynsampler) return nil } @@ -83,23 +81,8 @@ func (d *DynamicSampler) GetSampleRate(trace *types.Trace) (rate uint, keep bool "trace_id": trace.TraceID, "span_count": count, }).Logf("got sample rate and decision") - if shouldKeep { - d.Metrics.Increment(d.prefix + "num_kept") - } else { - d.Metrics.Increment(d.prefix + "num_dropped") - } - d.Metrics.Histogram(d.prefix+"sample_rate", float64(rate)) - for name, val := range d.dynsampler.GetMetrics(d.prefix) { - switch getMetricType(name) { - case "counter": - delta := val - d.lastMetrics[name] - d.Metrics.Count(name, delta) - d.lastMetrics[name] = val - case "gauge": - d.Metrics.Gauge(name, val) - } - } - return rate, shouldKeep, "dynamic", key + d.metricsRecorder.RecordMetrics(d.dynsampler, shouldKeep, rate) + return rate, shouldKeep, d.prefix, key } func (d *DynamicSampler) GetKeyFields() []string { diff --git a/sample/dynamic_ema.go b/sample/dynamic_ema.go index 758ce23f55..6c340bccaf 100644 --- a/sample/dynamic_ema.go +++ b/sample/dynamic_ema.go @@ -30,7 +30,8 @@ type EMADynamicSampler struct { key *traceKey keyFields []string - dynsampler dynsampler.Sampler + dynsampler *dynsampler.EMASampleRate + metricsRecorder *dynsamplerMetricsRecorder } func (d *EMADynamicSampler) Start() error { @@ -47,7 +48,7 @@ func (d *EMADynamicSampler) Start() error { if d.maxKeys == 0 { d.maxKeys = 500 } - d.prefix = "emadynamic_" + d.prefix = "emadynamic" d.keyFields = d.Config.GetSamplingFields() // spin up the actual dynamic sampler @@ -63,13 +64,12 @@ func (d *EMADynamicSampler) Start() error { d.dynsampler.Start() // Register statistics this package will produce - d.lastMetrics = d.dynsampler.GetMetrics(d.prefix) - for name := range d.lastMetrics { - 
d.Metrics.Register(name, getMetricType(name)) + d.metricsRecorder = &dynsamplerMetricsRecorder{ + prefix: d.prefix, + met: d.Metrics, } - d.Metrics.Register(d.prefix+"num_dropped", "counter") - d.Metrics.Register(d.prefix+"num_kept", "counter") - d.Metrics.Register(d.prefix+"sample_rate", "histogram") + + d.metricsRecorder.RegisterMetrics(d.dynsampler) return nil } @@ -91,23 +91,9 @@ func (d *EMADynamicSampler) GetSampleRate(trace *types.Trace) (rate uint, keep b "trace_id": trace.TraceID, "span_count": count, }).Logf("got sample rate and decision") - if shouldKeep { - d.Metrics.Increment(d.prefix + "num_kept") - } else { - d.Metrics.Increment(d.prefix + "num_dropped") - } - d.Metrics.Histogram(d.prefix+"sample_rate", float64(rate)) - for name, val := range d.dynsampler.GetMetrics(d.prefix) { - switch getMetricType(name) { - case "counter": - delta := val - d.lastMetrics[name] - d.Metrics.Count(name, delta) - d.lastMetrics[name] = val - case "gauge": - d.Metrics.Gauge(name, val) - } - } - return rate, shouldKeep, "emadynamic", key + d.metricsRecorder.RecordMetrics(d.dynsampler, shouldKeep, rate) + + return rate, shouldKeep, d.prefix, key } func (d *EMADynamicSampler) GetKeyFields() []string { diff --git a/sample/ema_throughput.go b/sample/ema_throughput.go index e3b3bd7ce9..d7c53b8f7f 100644 --- a/sample/ema_throughput.go +++ b/sample/ema_throughput.go @@ -28,12 +28,12 @@ type EMAThroughputSampler struct { burstDetectionDelay uint maxKeys int prefix string - lastMetrics map[string]int64 key *traceKey keyFields []string - dynsampler *dynsampler.EMAThroughput + dynsampler *dynsampler.EMAThroughput + metricsRecorder *dynsamplerMetricsRecorder } func (d *EMAThroughputSampler) Start() error { @@ -55,7 +55,7 @@ func (d *EMAThroughputSampler) Start() error { if d.maxKeys == 0 { d.maxKeys = 500 } - d.prefix = "emathroughput_" + d.prefix = "emathroughput" d.keyFields = d.Config.GetSamplingFields() // spin up the actual dynamic sampler @@ -72,13 +72,11 @@ func (d *EMAThroughputSampler) Start() error { d.dynsampler.Start() // Register statistics this package will produce - d.lastMetrics = d.dynsampler.GetMetrics(d.prefix) - for name := range d.lastMetrics { - d.Metrics.Register(name, getMetricType(name)) + d.metricsRecorder = &dynsamplerMetricsRecorder{ + prefix: d.prefix, + met: d.Metrics, } - d.Metrics.Register(d.prefix+"num_dropped", "counter") - d.Metrics.Register(d.prefix+"num_kept", "counter") - d.Metrics.Register(d.prefix+"sample_rate", "histogram") + d.metricsRecorder.RegisterMetrics(d.dynsampler) return nil } @@ -108,23 +106,8 @@ func (d *EMAThroughputSampler) GetSampleRate(trace *types.Trace) (rate uint, kee "trace_id": trace.TraceID, "span_count": count, }).Logf("got sample rate and decision") - if shouldKeep { - d.Metrics.Increment(d.prefix + "num_kept") - } else { - d.Metrics.Increment(d.prefix + "num_dropped") - } - d.Metrics.Histogram(d.prefix+"sample_rate", float64(rate)) - for name, val := range d.dynsampler.GetMetrics(d.prefix) { - switch getMetricType(name) { - case "counter": - delta := val - d.lastMetrics[name] - d.Metrics.Count(name, delta) - d.lastMetrics[name] = val - case "gauge": - d.Metrics.Gauge(name, val) - } - } - return rate, shouldKeep, "emathroughput", key + d.metricsRecorder.RecordMetrics(d.dynsampler, shouldKeep, rate) + return rate, shouldKeep, d.prefix, key } func (d *EMAThroughputSampler) GetKeyFields() []string { diff --git a/sample/rules.go b/sample/rules.go index cd87f8f3a8..14d5b83679 100644 --- a/sample/rules.go +++ b/sample/rules.go @@ -25,15 +25,20 @@ type 
RulesBasedSampler struct { const RootPrefix = "root." +var ruleBasedSamplerMetrics = []metrics.Metadata{ + {Name: "_num_dropped_by_drop_rule", Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "Number of traces dropped by the drop rule"}, +} + func (s *RulesBasedSampler) Start() error { s.Logger.Debug().Logf("Starting RulesBasedSampler") defer func() { s.Logger.Debug().Logf("Finished starting RulesBasedSampler") }() - s.prefix = "rulesbased_" + s.prefix = "rulesbased" - s.Metrics.Register(s.prefix+"num_dropped", "counter") - s.Metrics.Register(s.prefix+"num_dropped_by_drop_rule", "counter") - s.Metrics.Register(s.prefix+"num_kept", "counter") - s.Metrics.Register(s.prefix+"sample_rate", "histogram") + ruleBasedSamplerMetrics = append(ruleBasedSamplerMetrics, samplerMetrics...) + for _, metric := range ruleBasedSamplerMetrics { + metric.Name = s.prefix + metric.Name + s.Metrics.Register(metric) + } s.samplers = make(map[string]Sampler) s.keyFields = s.Config.GetSamplingFields() @@ -137,16 +142,16 @@ func (s *RulesBasedSampler) GetSampleRate(trace *types.Trace) (rate uint, keep b rate = uint(rule.SampleRate) keep = !rule.Drop && rule.SampleRate > 0 && rand.Intn(rule.SampleRate) == 0 reason += rule.Name - s.Metrics.Histogram(s.prefix+"sample_rate", float64(rate)) + s.Metrics.Histogram(s.prefix+"_sample_rate", float64(rate)) } if keep { - s.Metrics.Increment(s.prefix + "num_kept") + s.Metrics.Increment(s.prefix + "_num_kept") } else { - s.Metrics.Increment(s.prefix + "num_dropped") + s.Metrics.Increment(s.prefix + "_num_dropped") if rule.Drop { // If we dropped because of an explicit drop rule, then increment that too. - s.Metrics.Increment(s.prefix + "num_dropped_by_drop_rule") + s.Metrics.Increment(s.prefix + "_num_dropped_by_drop_rule") } } logger.WithFields(map[string]interface{}{ diff --git a/sample/sample.go b/sample/sample.go index 0e7d6e9c63..ef5bba5752 100644 --- a/sample/sample.go +++ b/sample/sample.go @@ -5,6 +5,7 @@ import ( "os" "strings" + dynsampler "github.com/honeycombio/dynsampler-go" "github.com/honeycombio/refinery/config" "github.com/honeycombio/refinery/internal/peer" "github.com/honeycombio/refinery/logger" @@ -107,9 +108,58 @@ func (s *SamplerFactory) GetSamplerImplementationForKey(samplerKey string, isLeg return sampler } -func getMetricType(name string) string { +var samplerMetrics = []metrics.Metadata{ + {Name: "_num_dropped", Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "Number of traces dropped by configured sampler"}, + {Name: "_num_kept", Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "Number of traces kept by configured sampler"}, + {Name: "_sample_rate", Type: metrics.Histogram, Unit: metrics.Dimensionless, Description: "Sample rate for traces"}, +} + +func getMetricType(name string) metrics.MetricType { if strings.HasSuffix(name, "_count") { - return "counter" + return metrics.Counter + } + return metrics.Gauge +} + +type dynsamplerMetricsRecorder struct { + prefix string + lastMetrics map[string]int64 + met metrics.Metrics +} + +func (d *dynsamplerMetricsRecorder) RegisterMetrics(sampler dynsampler.Sampler) { + // Register statistics this package will produce + d.lastMetrics = sampler.GetMetrics(d.prefix + "_") + for name := range d.lastMetrics { + d.met.Register(metrics.Metadata{ + Name: name, + Type: getMetricType(name), + }) + } + + for _, metric := range samplerMetrics { + metric.Name = d.prefix + metric.Name + d.met.Register(metric) + } + +} + +func (d *dynsamplerMetricsRecorder) RecordMetrics(sampler 
dynsampler.Sampler, kept bool, rate uint) { + for name, val := range sampler.GetMetrics(d.prefix + "_") { + switch getMetricType(name) { + case metrics.Counter: + delta := val - d.lastMetrics[name] + d.met.Count(name, delta) + d.lastMetrics[name] = val + case metrics.Gauge: + d.met.Gauge(name, val) + } + } + + if kept { + d.met.Increment(d.prefix + "_num_kept") + } else { + d.met.Increment(d.prefix + "_num_dropped") } - return "gauge" + d.met.Histogram(d.prefix+"_sample_rate", float64(rate)) } diff --git a/sample/totalthroughput.go b/sample/totalthroughput.go index 9f47c7b41d..9ee17bd21e 100644 --- a/sample/totalthroughput.go +++ b/sample/totalthroughput.go @@ -23,12 +23,12 @@ type TotalThroughputSampler struct { clearFrequency config.Duration maxKeys int prefix string - lastMetrics map[string]int64 key *traceKey keyFields []string - dynsampler *dynsampler.TotalThroughput + dynsampler *dynsampler.TotalThroughput + metricsRecorder *dynsamplerMetricsRecorder } func (d *TotalThroughputSampler) Start() error { @@ -52,7 +52,7 @@ func (d *TotalThroughputSampler) Start() error { if d.maxKeys == 0 { d.maxKeys = 500 } - d.prefix = "totalthroughput_" + d.prefix = "totalthroughput" d.keyFields = d.Config.GetSamplingFields() // spin up the actual dynamic sampler @@ -64,14 +64,11 @@ func (d *TotalThroughputSampler) Start() error { d.dynsampler.Start() // Register statistics this package will produce - d.lastMetrics = d.dynsampler.GetMetrics(d.prefix) - for name := range d.lastMetrics { - d.Metrics.Register(name, getMetricType(name)) + d.metricsRecorder = &dynsamplerMetricsRecorder{ + prefix: d.prefix, + met: d.Metrics, } - d.Metrics.Register(d.prefix+"num_dropped", "counter") - d.Metrics.Register(d.prefix+"num_kept", "counter") - d.Metrics.Register(d.prefix+"sample_rate", "histogram") - + d.metricsRecorder.RegisterMetrics(d.dynsampler) return nil } @@ -100,23 +97,9 @@ func (d *TotalThroughputSampler) GetSampleRate(trace *types.Trace) (rate uint, k "trace_id": trace.TraceID, "span_count": count, }).Logf("got sample rate and decision") - if shouldKeep { - d.Metrics.Increment(d.prefix + "num_kept") - } else { - d.Metrics.Increment(d.prefix + "num_dropped") - } - d.Metrics.Histogram(d.prefix+"sample_rate", float64(rate)) - for name, val := range d.dynsampler.GetMetrics(d.prefix) { - switch getMetricType(name) { - case "counter": - delta := val - d.lastMetrics[name] - d.Metrics.Count(name, delta) - d.lastMetrics[name] = val - case "gauge": - d.Metrics.Gauge(name, val) - } - } - return rate, shouldKeep, "totalthroughput", key + + d.metricsRecorder.RecordMetrics(d.dynsampler, shouldKeep, rate) + return rate, shouldKeep, d.prefix, key } func (d *TotalThroughputSampler) GetKeyFields() []string { diff --git a/sample/windowed_throughput.go b/sample/windowed_throughput.go index 51220da8f1..3a0117beb1 100644 --- a/sample/windowed_throughput.go +++ b/sample/windowed_throughput.go @@ -24,12 +24,12 @@ type WindowedThroughputSampler struct { useClusterSize bool maxKeys int prefix string - lastMetrics map[string]int64 key *traceKey keyFields []string - dynsampler *dynsampler.WindowedThroughput + dynsampler *dynsampler.WindowedThroughput + metricsRecorder *dynsamplerMetricsRecorder } func (d *WindowedThroughputSampler) Start() error { @@ -47,7 +47,7 @@ func (d *WindowedThroughputSampler) Start() error { if d.maxKeys == 0 { d.maxKeys = 500 } - d.prefix = "windowedthroughput_" + d.prefix = "windowedthroughput" d.keyFields = d.Config.GetSamplingFields() // spin up the actual dynamic sampler @@ -60,14 +60,11 @@ func (d 
*WindowedThroughputSampler) Start() error { d.dynsampler.Start() // Register statistics this package will produce - d.lastMetrics = d.dynsampler.GetMetrics(d.prefix) - for name := range d.lastMetrics { - d.Metrics.Register(name, getMetricType(name)) + d.metricsRecorder = &dynsamplerMetricsRecorder{ + prefix: d.prefix, + met: d.Metrics, } - d.Metrics.Register(d.prefix+"num_dropped", "counter") - d.Metrics.Register(d.prefix+"num_kept", "counter") - d.Metrics.Register(d.prefix+"sample_rate", "histogram") - + d.metricsRecorder.RegisterMetrics(d.dynsampler) return nil } @@ -96,23 +93,9 @@ func (d *WindowedThroughputSampler) GetSampleRate(trace *types.Trace) (rate uint "trace_id": trace.TraceID, "span_count": count, }).Logf("got sample rate and decision") - if shouldKeep { - d.Metrics.Increment(d.prefix + "num_kept") - } else { - d.Metrics.Increment(d.prefix + "num_dropped") - } - d.Metrics.Histogram(d.prefix+"sample_rate", float64(rate)) - for name, val := range d.dynsampler.GetMetrics(d.prefix) { - switch getMetricType(name) { - case "counter": - delta := val - d.lastMetrics[name] - d.Metrics.Count(name, delta) - d.lastMetrics[name] = val - case "gauge": - d.Metrics.Gauge(name, val) - } - } - return rate, shouldKeep, "Windowedthroughput", key + d.metricsRecorder.RecordMetrics(d.dynsampler, shouldKeep, rate) + + return rate, shouldKeep, d.prefix, key } func (d *WindowedThroughputSampler) GetKeyFields() []string { return d.keyFields diff --git a/sample/windowed_throughput_test.go b/sample/windowed_throughput_test.go index 9fb7d9894b..ec81bb61ad 100644 --- a/sample/windowed_throughput_test.go +++ b/sample/windowed_throughput_test.go @@ -45,7 +45,7 @@ func TestWindowedThroughputAddSampleRateKeyToTrace(t *testing.T) { assert.Len(t, spans, spanCount, "should have the same number of spans as input") assert.Equal(t, uint(1), rate, "sample rate should be 1") - assert.Equal(t, "Windowedthroughput", reason) + assert.Equal(t, "windowedthroughput", reason) assert.Equal(t, "4•,200•,true•,/{slug}/fun•,", key) } diff --git a/transmit/transmit.go b/transmit/transmit.go index 3a3cad515b..c314299520 100644 --- a/transmit/transmit.go +++ b/transmit/transmit.go @@ -52,6 +52,14 @@ func NewDefaultTransmission(client *libhoney.Client, m metrics.Metrics, name str return &DefaultTransmission{LibhClient: client, Metrics: m, Name: name} } +var transmissionMetrics = []metrics.Metadata{ + {Name: counterEnqueueErrors, Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "The number of errors encountered when enqueueing events"}, + {Name: counterResponse20x, Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "The number of successful responses from Honeycomb"}, + {Name: counterResponseErrors, Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "The number of errors encountered when sending events to Honeycomb"}, + {Name: updownQueuedItems, Type: metrics.UpDown, Unit: metrics.Dimensionless, Description: "The number of events queued for transmission to Honeycomb"}, + {Name: histogramQueueTime, Type: metrics.Histogram, Unit: metrics.Microseconds, Description: "The time spent in the queue before being sent to Honeycomb"}, +} + func (d *DefaultTransmission) Start() error { d.Logger.Debug().Logf("Starting DefaultTransmission: %s type", d.Name) defer func() { d.Logger.Debug().Logf("Finished starting DefaultTransmission: %s type", d.Name) }() @@ -140,11 +148,9 @@ func (d *DefaultTransmission) Flush() { // RegisterMetrics registers the metrics used by the DefaultTransmission. 
// it should be called after the metrics object has been created. func (d *DefaultTransmission) RegisterMetrics() { - d.Metrics.Register(counterEnqueueErrors, "counter") - d.Metrics.Register(counterResponse20x, "counter") - d.Metrics.Register(counterResponseErrors, "counter") - d.Metrics.Register(updownQueuedItems, "updown") - d.Metrics.Register(histogramQueueTime, "histogram") + for _, m := range transmissionMetrics { + d.Metrics.Register(m) + } } func (d *DefaultTransmission) Stop() error { From eb061f072cc7ddf60345a8d2ed75af861e75dacb Mon Sep 17 00:00:00 2001 From: Yingrong Zhao <22300958+VinozzZ@users.noreply.github.com> Date: Thu, 26 Sep 2024 15:36:14 -0400 Subject: [PATCH 11/25] feat: generate metrics documentation (#1351) ## Which problem is this PR solving? This PR implements the logic to parse out all metrics defined in the Refinery code base and generates a markdown table for them. - part 2 of #1152 ## Short description of the changes - add two new commands to the `convert` tool to generate the metricsMeta.yaml file and a metrics.md file --- go.mod | 12 +- go.sum | 18 +- metrics.md | 77 ++++++++ tools/convert/Makefile | 20 +- tools/convert/main.go | 44 +++++ tools/convert/metrics.go | 157 +++++++++++++++ tools/convert/metricsMeta.yaml | 280 +++++++++++++++++++++++++++ tools/convert/templates/metrics.tmpl | 10 + 8 files changed, 608 insertions(+), 10 deletions(-) create mode 100644 metrics.md create mode 100644 tools/convert/metrics.go create mode 100644 tools/convert/metricsMeta.yaml create mode 100644 tools/convert/templates/metrics.tmpl diff --git a/go.mod b/go.mod index 6525708883..cc19f7fbea 100644 --- a/go.mod +++ b/go.mod @@ -48,6 +48,11 @@ require ( gopkg.in/yaml.v3 v3.0.1 ) +require ( + golang.org/x/mod v0.21.0 // indirect + golang.org/x/sync v0.8.0 // indirect +) + require ( github.com/beorn7/perks v1.0.1 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect @@ -73,9 +78,10 @@ require ( github.com/tidwall/pretty v1.2.0 // indirect github.com/vmihailenco/tagparser/v2 v2.0.0 // indirect go.uber.org/multierr v1.11.0 // indirect - golang.org/x/net v0.28.0 // indirect - golang.org/x/sys v0.24.0 // indirect - golang.org/x/text v0.17.0 // indirect + golang.org/x/net v0.29.0 // indirect + golang.org/x/sys v0.25.0 // indirect + golang.org/x/text v0.18.0 // indirect + golang.org/x/tools v0.25.0 google.golang.org/genproto/googleapis/api v0.0.0-20240822170219-fc7c04adadcd // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20240822170219-fc7c04adadcd // indirect ) diff --git a/go.sum b/go.sum index d06c6f99cd..a27c397e87 100644 --- a/go.sum +++ b/go.sum @@ -160,13 +160,19 @@ go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= golang.org/x/exp v0.0.0-20231127185646-65229373498e h1:Gvh4YaCaXNs6dKTlfgismwWZKyjVZXwOPfIyUaqU3No= golang.org/x/exp v0.0.0-20231127185646-65229373498e/go.mod h1:iRJReGqOEeBhDZGkGbynYwcHlctCvnjTYIamk7uXpHI= -golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE= -golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg= +golang.org/x/mod v0.21.0 h1:vvrHzRwRfVKSiLrG+d4FMl/Qi4ukBCE6kZlTUkDYRT0= +golang.org/x/mod v0.21.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= +golang.org/x/net v0.29.0 h1:5ORfpBpCs4HzDYoodCDBbwHzdR5UrLBZ3sOnUJmFoHo= +golang.org/x/net v0.29.0/go.mod h1:gLkgy8jTGERgjzMic6DS9+SP0ajcu6Xu3Orq/SpETg0= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= 
+golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= -golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/text v0.17.0 h1:XtiM5bkSOt+ewxlOE/aE/AKEHibwj/6gvWMl9Rsh0Qc= -golang.org/x/text v0.17.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= +golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34= +golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/text v0.18.0 h1:XvMDiNzPAl0jr17s6W9lcaIhGUfUORdGCNsuLmPG224= +golang.org/x/text v0.18.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= +golang.org/x/tools v0.25.0 h1:oFU9pkj/iJgs+0DT+VMHrx+oBKs/LJMV+Uvg78sl+fE= +golang.org/x/tools v0.25.0/go.mod h1:/vtpO8WL1N9cQC3FN5zPqb//fRXskFHbLKk4OW1Q7rg= google.golang.org/genproto/googleapis/api v0.0.0-20240822170219-fc7c04adadcd h1:BBOTEWLuuEGQy9n1y9MhVJ9Qt0BDu21X8qZs71/uPZo= google.golang.org/genproto/googleapis/api v0.0.0-20240822170219-fc7c04adadcd/go.mod h1:fO8wJzT2zbQbAjbIoos1285VfEIYKDDY+Dt+WpTkh6g= google.golang.org/genproto/googleapis/rpc v0.0.0-20240822170219-fc7c04adadcd h1:6TEm2ZxXoQmFWFlt1vNxvVOa1Q0dXFQD1m/rYjXmS0E= diff --git a/metrics.md b/metrics.md new file mode 100644 index 0000000000..71d7468e2a --- /dev/null +++ b/metrics.md @@ -0,0 +1,77 @@ +# Metrics Documentation +# Automatically generated on 2024-09-26 at 18:58:58 UTC + +This document contains the description of various metrics used in the system. + +| Name | Type | Unit | Description | +|------|------|------|-------------| +| collect_cache_buffer_overrun | Counter | Dimensionless | The number of times the trace overwritten in the circular buffer has not yet been sent | +| collect_cache_capacity | Gauge | Dimensionless | The number of traces that can be stored in the cache | +| collect_cache_entries | Histogram | Dimensionless | The number of traces currently stored in the cache | +| cuckoo_current_capacity | Gauge | Dimensionless | current capacity of the cuckoo filter | +| cuckoo_future_load_factor | Gauge | Percent | the fraction of slots occupied in the future cuckoo filter | +| cuckoo_current_load_factor | Gauge | Percent | the fraction of slots occupied in the current cuckoo filter | +| cache_recent_dropped_traces | Gauge | Dimensionless | the current size of the most recent dropped trace cache | +| collect_sent_reasons_cache_entries | Histogram | Dimensionless | Number of entries in the sent reasons cache | +| is_ready | Gauge | Dimensionless | Whether the system is ready to receive traffic | +| is_alive | Gauge | Dimensionless | Whether the system is alive and reporting in | +| redis_pubsub_published | Counter | Dimensionless | Number of messages published to Redis PubSub | +| redis_pubsub_received | Counter | Dimensionless | Number of messages received from Redis PubSub | +| local_pubsub_published | Counter | Dimensionless | The total number of messages sent via the local pubsub implementation | +| local_pubsub_received | Counter | Dimensionless | The total number of messages received via the local pubsub implementation | +| num_file_peers | Gauge | Dimensionless | Number of peers in the file peer list | +| num_peers | Gauge | Dimensionless | the active number of peers in the cluster | +| peer_hash | Gauge | Dimensionless | the hash of the current list of peers | +| peer_messages | Counter | Dimensionless 
| the number of messages received by the peers service | +| _num_dropped_by_drop_rule | Counter | Dimensionless | Number of traces dropped by the drop rule | +| _num_dropped | Counter | Dimensionless | Number of traces dropped by configured sampler | +| _num_kept | Counter | Dimensionless | Number of traces kept by configured sampler | +| _sample_rate | Histogram | Dimensionless | Sample rate for traces | +| enqueue_errors | Counter | Dimensionless | The number of errors encountered when enqueueing events | +| response_20x | Counter | Dimensionless | The number of successful responses from Honeycomb | +| response_errors | Counter | Dimensionless | The number of errors encountered when sending events to Honeycomb | +| queued_items | UpDown | Dimensionless | The number of events queued for transmission to Honeycomb | +| queue_time | Histogram | Microseconds | The time spent in the queue before being sent to Honeycomb | +| trace_duration_ms | Histogram | Milliseconds | time taken to process a trace from arrival to send | +| trace_span_count | Histogram | Dimensionless | number of spans in a trace | +| collector_incoming_queue | Histogram | Dimensionless | number of spans currently in the incoming queue | +| collector_peer_queue_length | Gauge | Dimensionless | number of spans in the peer queue | +| collector_incoming_queue_length | Gauge | Dimensionless | number of spans in the incoming queue | +| collector_peer_queue | Histogram | Dimensionless | number of spans currently in the peer queue | +| collector_cache_size | Gauge | Dimensionless | number of traces currently stored in the trace cache | +| memory_heap_allocation | Gauge | Bytes | current heap allocation | +| span_received | Counter | Dimensionless | number of spans received by the collector | +| span_processed | Counter | Dimensionless | number of spans processed by the collector | +| spans_waiting | UpDown | Dimensionless | number of spans waiting to be processed by the collector | +| trace_sent_cache_hit | Counter | Dimensionless | number of late spans received for traces that have already been sent | +| trace_accepted | Counter | Dimensionless | number of new traces received by the collector | +| trace_send_kept | Counter | Dimensionless | number of traces that has been kept | +| trace_send_dropped | Counter | Dimensionless | number of traces that has been dropped | +| trace_send_has_root | Counter | Dimensionless | number of kept traces that have a root span | +| trace_send_no_root | Counter | Dimensionless | number of kept traces that do not have a root span | +| trace_forwarded_on_peer_change | Gauge | Dimensionless | number of traces forwarded due to peer membership change | +| trace_redistribution_count | Gauge | Dimensionless | number of traces redistributed due to peer membership change | +| trace_send_on_shutdown | Counter | Dimensionless | number of traces sent during shutdown | +| trace_forwarded_on_shutdown | Counter | Dimensionless | number of traces forwarded during shutdown | +| trace_send_got_root | Counter | Dimensionless | number of traces that are ready for decision due to root span arrival | +| trace_send_expired | Counter | Dimensionless | number of traces that are ready for decision due to TraceTimeout or SendDelay | +| trace_send_span_limit | Counter | Dimensionless | number of traces that are ready for decision due to span limit | +| trace_send_ejected_full | Counter | Dimensionless | number of traces that are ready for decision due to cache capacity overrun | +| trace_send_ejected_memsize | Counter | 
Dimensionless | number of traces that are ready for decision due to memory overrun | +| trace_send_late_span | Counter | Dimensionless | number of spans that are sent due to late span arrival | +| dropped_from_stress | Counter | Dimensionless | number of traces dropped due to stress relief | +| cluster_stress_level | Gauge | Dimensionless | The overall stress level of the cluster | +| individual_stress_level | Gauge | Dimensionless | The stress level of the individual node | +| stress_level | Gauge | Dimensionless | The stress level that's being used to determine whether to activate stress relief | +| stress_relief_activated | Gauge | Dimensionless | Whether stress relief is currently activated | +| _router_proxied | Counter | Dimensionless | the number of events proxied to another refinery | +| _router_event | Counter | Dimensionless | the number of events received | +| config_hash | Gauge | Dimensionless | The hash of the current configuration | +| rule_config_hash | Gauge | Dimensionless | The hash of the current rules configuration | +| queue_length | Gauge | Dimensionless | number of events waiting to be sent to destination | +| queue_overflow | Counter | Dimensionless | number of events dropped due to queue overflow | +| send_errors | Counter | Dimensionless | number of errors encountered while sending events to destination | +| send_retries | Counter | Dimensionless | number of times a batch of events was retried | +| batches_sent | Counter | Dimensionless | number of batches of events sent to destination | +| messages_sent | Counter | Dimensionless | number of messages sent to destination | +| response_decode_errors | Counter | Dimensionless | number of errors encountered while decoding responses from destination | diff --git a/tools/convert/Makefile b/tools/convert/Makefile index 3d4e1a422b..37cb1054f8 100644 --- a/tools/convert/Makefile +++ b/tools/convert/Makefile @@ -1,6 +1,6 @@ .PHONY: all #: build all the things -all: template names sample complete docs validate build +all: template names sample metrics complete docs validate build .PHONY: build #: build the binary @@ -34,6 +34,24 @@ sample: @echo go run . sample --output=minimal_config.yaml +.PHONY: metricsmeta +metricsmeta: + @echo + @echo "+++ generating metrics metadata file" + @echo + go run . metricsmeta + +.PHONY: metricsdoc +metricsdoc: + @echo + @echo "+++ generating metrics markdown documentation file" + @echo + go run . 
metrics --output=../../metrics.md + + +.PHONY: metrics +metrics: metricsmeta metricsdoc + .PHONY: complete #: generate the complete config complete: diff --git a/tools/convert/main.go b/tools/convert/main.go index 750d698e78..21e51cc334 100644 --- a/tools/convert/main.go +++ b/tools/convert/main.go @@ -105,6 +105,8 @@ func main() { convert validate rules: validate a rules file against the 2.0 format convert doc config: generate markdown documentation for the config file convert doc rules: generate markdown documentation for the rules file + convert metricsMeta: generates the metrics metadata file + convert metrics: generates markdown documentation for all refinery metrics Examples: convert config --input config.toml --output config.yaml @@ -173,6 +175,20 @@ func main() { os.Exit(1) } os.Exit(0) + case "metricsmeta": + err := GenerateMetricsMetadata() + if err != nil { + fmt.Fprintf(os.Stderr, `error generating metrics metadata: %v\n`, err) + os.Exit(1) + } + os.Exit(0) + case "metrics": + err := GenerateMetricsDoc(output) + if err != nil { + fmt.Fprintf(os.Stderr, `error generating metrics documentation: %v\n`, err) + os.Exit(1) + } + os.Exit(0) case "config", "rules", "validate", "helm": // do nothing yet because we need to parse the input file default: @@ -373,6 +389,34 @@ func GenerateTemplate(w io.Writer) { } } +func GenerateMetricsDoc(w io.Writer) error { + data, err := os.ReadFile("metricsMeta.yaml") + if err != nil { + return err + } + + var metricsUsages []MetricsUsage + err = yaml.Unmarshal(data, &metricsUsages) + if err != nil { + return err + } + + tmpl := template.New("metrics.tmpl") + tmpl.Funcs(helpers()) + tmpl, err = tmpl.ParseFS(filesystem, "templates/metrics.tmpl") + if err != nil { + return err + } + + err = tmpl.Execute(w, metricsUsages) + if err != nil { + return err + } + + fmt.Println("Metrics usages have been written to the output file") + return nil +} + // This generates a nested list of the groups and names. func PrintNames(w io.Writer) { metadata := loadConfigMetadata() diff --git a/tools/convert/metrics.go b/tools/convert/metrics.go new file mode 100644 index 0000000000..6c137789c4 --- /dev/null +++ b/tools/convert/metrics.go @@ -0,0 +1,157 @@ +package main + +import ( + "fmt" + "go/ast" + "go/types" + "os" + "slices" + "strings" + + "golang.org/x/exp/maps" + "golang.org/x/tools/go/packages" + "gopkg.in/yaml.v3" +) + +type MetricsUsage struct { + Name string + Type string + Unit string + Description string +} + +const metricsImportPath = "github.com/honeycombio/refinery/metrics" + +func GenerateMetricsMetadata() error { + output, err := os.Create("metricsMeta.yaml") + if err != nil { + return fmt.Errorf("error creating output file: %v", err) + } + defer output.Close() + + // Configuration to load Go packages. + cfg := &packages.Config{ + Mode: packages.NeedCompiledGoFiles | packages.NeedImports | packages.NeedName | packages.NeedSyntax | packages.NeedTypes | packages.NeedTypesInfo, + } + + // Load the package from the current directory. + pkgs, err := packages.Load(cfg, "github.com/honeycombio/refinery/...") + if err != nil { + return fmt.Errorf("error loading packages: %v", err) + } + + usages := make([]MetricsUsage, 0) + // Traverse each package and file. + for _, pkg := range pkgs { + if !slices.Contains(maps.Keys(pkg.Imports), metricsImportPath) { + continue + } + + var found bool + // Iterate over the syntax trees (ASTs) of each file in the package. + for _, syntax := range pkg.Syntax { + // Inspect the AST for each file. 
+ ast.Inspect(syntax, func(n ast.Node) bool { + // Look for all slice type declarations + if decl, ok := n.(*ast.CompositeLit); ok { + + if arrayType, ok := decl.Type.(*ast.ArrayType); ok { + // Check if the element type of the array is a selector expression + if selector, ok := arrayType.Elt.(*ast.SelectorExpr); ok { + // Check if the package and type name match "metrics.Metadata" + if pkgIdent, ok := selector.X.(*ast.Ident); ok && pkgIdent.Name == "metrics" && selector.Sel.Name == "Metadata" { + + // Now extract the fields from the composite literal + for _, elt := range decl.Elts { + if comp, ok := elt.(*ast.CompositeLit); ok { + var usage MetricsUsage + for _, elt := range comp.Elts { + if kvExpr, ok := elt.(*ast.KeyValueExpr); ok { + field := exprToString(kvExpr.Key, pkg) + value := exprToString(kvExpr.Value, pkg) + + switch field { + case "Name": + usage.Name = value + case "Type": + usage.Type = value + case "Unit": + usage.Unit = value + case "Description": + usage.Description = value + + } + } + } + + if usage.Name == "" { + continue + } + usages = append(usages, usage) + found = true + } + } + } + } + } + } + return true + }) + + } + if !found { + return fmt.Errorf("Missing metrics.Metadata declaration in the package %s", pkg.Name) + } + } + + if len(usages) == 0 { + return fmt.Errorf("No metrics.Metadata declarations found in all packages") + } + + err = writeMetricsToYAML(usages, output) + if err != nil { + return fmt.Errorf("error writing metrics to YAML: %v", err) + } + + fmt.Printf("Metrics usages have been written to %s\n", output.Name()) + return nil + +} + +// exprToString is a helper function to convert ast.Expr to a string representation +func exprToString(expr ast.Expr, pkg *packages.Package) string { + var strVal string + switch v := expr.(type) { + case *ast.Ident: + if obj := pkg.TypesInfo.ObjectOf(v); obj != nil { + // Get the value of the variable (if constant) + if constVal, ok := obj.(*types.Const); ok { + strVal = constVal.Val().String() + break + } + + } + strVal = v.Name + case *ast.BasicLit: + strVal = v.Value + case *ast.SelectorExpr: + strVal = v.Sel.Name + default: + strVal = fmt.Sprintf("%T", expr) + } + + return strings.Trim(strVal, "\"") +} + +func writeMetricsToYAML(metricsUsages []MetricsUsage, output *os.File) error { + // Create a new YAML encoder and write the metrics + encoder := yaml.NewEncoder(output) + defer encoder.Close() + + err := encoder.Encode(metricsUsages) + if err != nil { + return fmt.Errorf("error encoding YAML: %v", err) + } + + return nil +} diff --git a/tools/convert/metricsMeta.yaml b/tools/convert/metricsMeta.yaml new file mode 100644 index 0000000000..4048d18c44 --- /dev/null +++ b/tools/convert/metricsMeta.yaml @@ -0,0 +1,280 @@ +- name: collect_cache_buffer_overrun + type: Counter + unit: Dimensionless + description: The number of times the trace overwritten in the circular buffer has not yet been sent +- name: collect_cache_capacity + type: Gauge + unit: Dimensionless + description: The number of traces that can be stored in the cache +- name: collect_cache_entries + type: Histogram + unit: Dimensionless + description: The number of traces currently stored in the cache +- name: cuckoo_current_capacity + type: Gauge + unit: Dimensionless + description: current capacity of the cuckoo filter +- name: cuckoo_future_load_factor + type: Gauge + unit: Percent + description: the fraction of slots occupied in the future cuckoo filter +- name: cuckoo_current_load_factor + type: Gauge + unit: Percent + description: the fraction of 
slots occupied in the current cuckoo filter +- name: cache_recent_dropped_traces + type: Gauge + unit: Dimensionless + description: the current size of the most recent dropped trace cache +- name: collect_sent_reasons_cache_entries + type: Histogram + unit: Dimensionless + description: Number of entries in the sent reasons cache +- name: is_ready + type: Gauge + unit: Dimensionless + description: Whether the system is ready to receive traffic +- name: is_alive + type: Gauge + unit: Dimensionless + description: Whether the system is alive and reporting in +- name: redis_pubsub_published + type: Counter + unit: Dimensionless + description: Number of messages published to Redis PubSub +- name: redis_pubsub_received + type: Counter + unit: Dimensionless + description: Number of messages received from Redis PubSub +- name: local_pubsub_published + type: Counter + unit: Dimensionless + description: The total number of messages sent via the local pubsub implementation +- name: local_pubsub_received + type: Counter + unit: Dimensionless + description: The total number of messages received via the local pubsub implementation +- name: num_file_peers + type: Gauge + unit: Dimensionless + description: Number of peers in the file peer list +- name: num_peers + type: Gauge + unit: Dimensionless + description: the active number of peers in the cluster +- name: peer_hash + type: Gauge + unit: Dimensionless + description: the hash of the current list of peers +- name: peer_messages + type: Counter + unit: Dimensionless + description: the number of messages received by the peers service +- name: _num_dropped_by_drop_rule + type: Counter + unit: Dimensionless + description: Number of traces dropped by the drop rule +- name: _num_dropped + type: Counter + unit: Dimensionless + description: Number of traces dropped by configured sampler +- name: _num_kept + type: Counter + unit: Dimensionless + description: Number of traces kept by configured sampler +- name: _sample_rate + type: Histogram + unit: Dimensionless + description: Sample rate for traces +- name: enqueue_errors + type: Counter + unit: Dimensionless + description: The number of errors encountered when enqueueing events +- name: response_20x + type: Counter + unit: Dimensionless + description: The number of successful responses from Honeycomb +- name: response_errors + type: Counter + unit: Dimensionless + description: The number of errors encountered when sending events to Honeycomb +- name: queued_items + type: UpDown + unit: Dimensionless + description: The number of events queued for transmission to Honeycomb +- name: queue_time + type: Histogram + unit: Microseconds + description: The time spent in the queue before being sent to Honeycomb +- name: trace_duration_ms + type: Histogram + unit: Milliseconds + description: time taken to process a trace from arrival to send +- name: trace_span_count + type: Histogram + unit: Dimensionless + description: number of spans in a trace +- name: collector_incoming_queue + type: Histogram + unit: Dimensionless + description: number of spans currently in the incoming queue +- name: collector_peer_queue_length + type: Gauge + unit: Dimensionless + description: number of spans in the peer queue +- name: collector_incoming_queue_length + type: Gauge + unit: Dimensionless + description: number of spans in the incoming queue +- name: collector_peer_queue + type: Histogram + unit: Dimensionless + description: number of spans currently in the peer queue +- name: collector_cache_size + type: Gauge + unit: Dimensionless 
+ description: number of traces currently stored in the trace cache +- name: memory_heap_allocation + type: Gauge + unit: Bytes + description: current heap allocation +- name: span_received + type: Counter + unit: Dimensionless + description: number of spans received by the collector +- name: span_processed + type: Counter + unit: Dimensionless + description: number of spans processed by the collector +- name: spans_waiting + type: UpDown + unit: Dimensionless + description: number of spans waiting to be processed by the collector +- name: trace_sent_cache_hit + type: Counter + unit: Dimensionless + description: number of late spans received for traces that have already been sent +- name: trace_accepted + type: Counter + unit: Dimensionless + description: number of new traces received by the collector +- name: trace_send_kept + type: Counter + unit: Dimensionless + description: number of traces that has been kept +- name: trace_send_dropped + type: Counter + unit: Dimensionless + description: number of traces that has been dropped +- name: trace_send_has_root + type: Counter + unit: Dimensionless + description: number of kept traces that have a root span +- name: trace_send_no_root + type: Counter + unit: Dimensionless + description: number of kept traces that do not have a root span +- name: trace_forwarded_on_peer_change + type: Gauge + unit: Dimensionless + description: number of traces forwarded due to peer membership change +- name: trace_redistribution_count + type: Gauge + unit: Dimensionless + description: number of traces redistributed due to peer membership change +- name: trace_send_on_shutdown + type: Counter + unit: Dimensionless + description: number of traces sent during shutdown +- name: trace_forwarded_on_shutdown + type: Counter + unit: Dimensionless + description: number of traces forwarded during shutdown +- name: trace_send_got_root + type: Counter + unit: Dimensionless + description: number of traces that are ready for decision due to root span arrival +- name: trace_send_expired + type: Counter + unit: Dimensionless + description: number of traces that are ready for decision due to TraceTimeout or SendDelay +- name: trace_send_span_limit + type: Counter + unit: Dimensionless + description: number of traces that are ready for decision due to span limit +- name: trace_send_ejected_full + type: Counter + unit: Dimensionless + description: number of traces that are ready for decision due to cache capacity overrun +- name: trace_send_ejected_memsize + type: Counter + unit: Dimensionless + description: number of traces that are ready for decision due to memory overrun +- name: trace_send_late_span + type: Counter + unit: Dimensionless + description: number of spans that are sent due to late span arrival +- name: dropped_from_stress + type: Counter + unit: Dimensionless + description: number of traces dropped due to stress relief +- name: cluster_stress_level + type: Gauge + unit: Dimensionless + description: The overall stress level of the cluster +- name: individual_stress_level + type: Gauge + unit: Dimensionless + description: The stress level of the individual node +- name: stress_level + type: Gauge + unit: Dimensionless + description: The stress level that's being used to determine whether to activate stress relief +- name: stress_relief_activated + type: Gauge + unit: Dimensionless + description: Whether stress relief is currently activated +- name: _router_proxied + type: Counter + unit: Dimensionless + description: the number of events proxied to another refinery 
+- name: _router_event + type: Counter + unit: Dimensionless + description: the number of events received +- name: config_hash + type: Gauge + unit: Dimensionless + description: The hash of the current configuration +- name: rule_config_hash + type: Gauge + unit: Dimensionless + description: The hash of the current rules configuration +- name: queue_length + type: Gauge + unit: Dimensionless + description: number of events waiting to be sent to destination +- name: queue_overflow + type: Counter + unit: Dimensionless + description: number of events dropped due to queue overflow +- name: send_errors + type: Counter + unit: Dimensionless + description: number of errors encountered while sending events to destination +- name: send_retries + type: Counter + unit: Dimensionless + description: number of times a batch of events was retried +- name: batches_sent + type: Counter + unit: Dimensionless + description: number of batches of events sent to destination +- name: messages_sent + type: Counter + unit: Dimensionless + description: number of messages sent to destination +- name: response_decode_errors + type: Counter + unit: Dimensionless + description: number of errors encountered while decoding responses from destination diff --git a/tools/convert/templates/metrics.tmpl b/tools/convert/templates/metrics.tmpl new file mode 100644 index 0000000000..095578af36 --- /dev/null +++ b/tools/convert/templates/metrics.tmpl @@ -0,0 +1,10 @@ +# Metrics Documentation +# Automatically generated {{ now }} + +This document contains the description of various metrics used in the system. + +| Name | Type | Unit | Description | +|------|------|------|-------------| +{{- range . }} +| {{ .Name }} | {{ .Type }} | {{ .Unit }} | {{ .Description }} | +{{- end }} From 4e49ccb08e089d2051332c5d61028aed3cde8aaf Mon Sep 17 00:00:00 2001 From: Yingrong Zhao <22300958+VinozzZ@users.noreply.github.com> Date: Fri, 27 Sep 2024 14:06:06 -0400 Subject: [PATCH 12/25] feat(doc): separate table for metrics contains prefix (#1354) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Which problem is this PR solving? The current metrics metadata generator does not resolve dynamically assigned prefixes during metrics registration. As a result, metrics with dynamic prefixes are not displayed with their full names. To improve clarity, these metrics should be shown in a separate table. ## Short description of the changes - Identified that certain packages in Refinery—specifically those used for peer and incoming nodes, as well as the sampling package—assign dynamic prefixes to their metrics. - Added logic to the generator to detect metrics from these packages and categorize them separately. - Updated the metrics.tmpl template to display metrics with dynamic prefixes in a distinct table, ensuring clear differentiation between metrics with and without full names. 
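
For readers skimming this patch, the reason these names can only be shown partially is the registration pattern visible in the sampler and router diffs earlier in this series: the `metrics.Metadata` slices declare suffix-only names (for example `_num_kept`), and the owning component prepends a prefix chosen at runtime when it calls `Register`. The sketch below is a minimal, self-contained illustration of that pattern; the `Metadata` struct and the `register` helper are simplified stand-ins, not the actual refinery `metrics` package API.

```go
package main

import "fmt"

// Metadata is a simplified stand-in for the metrics.Metadata struct; only the
// fields relevant to name resolution are kept here.
type Metadata struct {
	Name        string
	Type        string
	Unit        string
	Description string
}

// samplerMetrics mirrors the suffix-only declarations the generator finds in
// the sample package: the leading underscore is all a static AST scan can see.
var samplerMetrics = []Metadata{
	{Name: "_num_dropped", Type: "Counter", Unit: "Dimensionless", Description: "Number of traces dropped by configured sampler"},
	{Name: "_num_kept", Type: "Counter", Unit: "Dimensionless", Description: "Number of traces kept by configured sampler"},
	{Name: "_sample_rate", Type: "Histogram", Unit: "Dimensionless", Description: "Sample rate for traces"},
}

// register prepends the prefix chosen at runtime (e.g. "rulesbased" or
// "dynamic"), which is why the full names never appear in the source.
func register(prefix string, defs []Metadata) []Metadata {
	out := make([]Metadata, 0, len(defs))
	for _, m := range defs {
		m.Name = prefix + m.Name // m is a value copy, so the shared slice is untouched
		out = append(out, m)
	}
	return out
}

func main() {
	for _, m := range register("rulesbased", samplerMetrics) {
		fmt.Printf("%s (%s, %s): %s\n", m.Name, m.Type, m.Unit, m.Description)
	}
}
```

Because only the suffix is a compile-time constant, the static scan in tools/convert/metrics.go cannot recover the full metric name, which is why this change lists such metrics in their own table rather than guessing at prefixes.
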
--- collect/collect.go | 2 + config/metadata/configMeta.yaml | 1 + metrics.md | 43 +- tools/convert/main.go | 2 +- tools/convert/metrics.go | 21 +- tools/convert/metricsMeta.yaml | 570 ++++++++++++++------------- tools/convert/templates/metrics.tmpl | 24 +- 7 files changed, 359 insertions(+), 304 deletions(-) diff --git a/collect/collect.go b/collect/collect.go index 58dd6ba6f0..59a5f85d53 100644 --- a/collect/collect.go +++ b/collect/collect.go @@ -126,6 +126,8 @@ var inMemCollectorMetrics = []metrics.Metadata{ {Name: TraceSendLateSpan, Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "number of spans that are sent due to late span arrival"}, {Name: "dropped_from_stress", Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "number of traces dropped due to stress relief"}, + {Name: "trace_kept_sample_rate", Type: metrics.Histogram, Unit: metrics.Dimensionless, Description: "sample rate of kept traces"}, + {Name: "trace_aggregate_sample_rate", Type: metrics.Histogram, Unit: metrics.Dimensionless, Description: "aggregate sample rate of both kept and dropped traces"}, } func (i *InMemCollector) Start() error { diff --git a/config/metadata/configMeta.yaml b/config/metadata/configMeta.yaml index a0d1823e9b..448aa40e52 100644 --- a/config/metadata/configMeta.yaml +++ b/config/metadata/configMeta.yaml @@ -357,6 +357,7 @@ groups: - name: SpanLimit type: int valuetype: nondefault + firstversion: v2.8 default: 0 reload: true summary: is the number of spans after which a trace becomes eligible for a trace decision. diff --git a/metrics.md b/metrics.md index 71d7468e2a..9060900d97 100644 --- a/metrics.md +++ b/metrics.md @@ -1,7 +1,12 @@ -# Metrics Documentation -# Automatically generated on 2024-09-26 at 18:58:58 UTC +# Honeycomb Refinery Metrics Documentation -This document contains the description of various metrics used in the system. +This document contains the description of various metrics used in Refinery. +It was automatically generated on 2024-09-27 at 16:19:55 UTC. + +Note: This document does not include metrics defined in the dynsampler-go dependency, as those metrics are generated dynamically at runtime. As a result, certain metrics may be missing or incomplete in this document, but they will still be available during execution with their full names. + +## Complete Metrics +This table includes metrics with fully defined names. | Name | Type | Unit | Description | |------|------|------|-------------| @@ -23,15 +28,6 @@ This document contains the description of various metrics used in the system. 
| num_peers | Gauge | Dimensionless | the active number of peers in the cluster | | peer_hash | Gauge | Dimensionless | the hash of the current list of peers | | peer_messages | Counter | Dimensionless | the number of messages received by the peers service | -| _num_dropped_by_drop_rule | Counter | Dimensionless | Number of traces dropped by the drop rule | -| _num_dropped | Counter | Dimensionless | Number of traces dropped by configured sampler | -| _num_kept | Counter | Dimensionless | Number of traces kept by configured sampler | -| _sample_rate | Histogram | Dimensionless | Sample rate for traces | -| enqueue_errors | Counter | Dimensionless | The number of errors encountered when enqueueing events | -| response_20x | Counter | Dimensionless | The number of successful responses from Honeycomb | -| response_errors | Counter | Dimensionless | The number of errors encountered when sending events to Honeycomb | -| queued_items | UpDown | Dimensionless | The number of events queued for transmission to Honeycomb | -| queue_time | Histogram | Microseconds | The time spent in the queue before being sent to Honeycomb | | trace_duration_ms | Histogram | Milliseconds | time taken to process a trace from arrival to send | | trace_span_count | Histogram | Dimensionless | number of spans in a trace | | collector_incoming_queue | Histogram | Dimensionless | number of spans currently in the incoming queue | @@ -60,14 +56,33 @@ This document contains the description of various metrics used in the system. | trace_send_ejected_memsize | Counter | Dimensionless | number of traces that are ready for decision due to memory overrun | | trace_send_late_span | Counter | Dimensionless | number of spans that are sent due to late span arrival | | dropped_from_stress | Counter | Dimensionless | number of traces dropped due to stress relief | +| trace_kept_sample_rate | Histogram | Dimensionless | sample rate of kept traces | +| trace_aggregate_sample_rate | Histogram | Dimensionless | aggregate sample rate of both kept and dropped traces | | cluster_stress_level | Gauge | Dimensionless | The overall stress level of the cluster | | individual_stress_level | Gauge | Dimensionless | The stress level of the individual node | | stress_level | Gauge | Dimensionless | The stress level that's being used to determine whether to activate stress relief | | stress_relief_activated | Gauge | Dimensionless | Whether stress relief is currently activated | -| _router_proxied | Counter | Dimensionless | the number of events proxied to another refinery | -| _router_event | Counter | Dimensionless | the number of events received | | config_hash | Gauge | Dimensionless | The hash of the current configuration | | rule_config_hash | Gauge | Dimensionless | The hash of the current rules configuration | + + +## Metrics with Prefix +This table includes metrics with partially defined names. +Metrics in this table don't contain their expected prefixes. This is because the auto-generator is unable to resolve dynamically created metric names during the generation process. 
+ +| Name | Type | Unit | Description | +|------|------|------|-------------| +| _num_dropped_by_drop_rule | Counter | Dimensionless | Number of traces dropped by the drop rule | +| _num_dropped | Counter | Dimensionless | Number of traces dropped by configured sampler | +| _num_kept | Counter | Dimensionless | Number of traces kept by configured sampler | +| _sample_rate | Histogram | Dimensionless | Sample rate for traces | +| enqueue_errors | Counter | Dimensionless | The number of errors encountered when enqueueing events | +| response_20x | Counter | Dimensionless | The number of successful responses from Honeycomb | +| response_errors | Counter | Dimensionless | The number of errors encountered when sending events to Honeycomb | +| queued_items | UpDown | Dimensionless | The number of events queued for transmission to Honeycomb | +| queue_time | Histogram | Microseconds | The time spent in the queue before being sent to Honeycomb | +| _router_proxied | Counter | Dimensionless | the number of events proxied to another refinery | +| _router_event | Counter | Dimensionless | the number of events received | | queue_length | Gauge | Dimensionless | number of events waiting to be sent to destination | | queue_overflow | Counter | Dimensionless | number of events dropped due to queue overflow | | send_errors | Counter | Dimensionless | number of errors encountered while sending events to destination | diff --git a/tools/convert/main.go b/tools/convert/main.go index 21e51cc334..8ad1438459 100644 --- a/tools/convert/main.go +++ b/tools/convert/main.go @@ -395,7 +395,7 @@ func GenerateMetricsDoc(w io.Writer) error { return err } - var metricsUsages []MetricsUsage + var metricsUsages MetricsOutput err = yaml.Unmarshal(data, &metricsUsages) if err != nil { return err diff --git a/tools/convert/metrics.go b/tools/convert/metrics.go index 6c137789c4..46b7678fa7 100644 --- a/tools/convert/metrics.go +++ b/tools/convert/metrics.go @@ -20,8 +20,15 @@ type MetricsUsage struct { Description string } +type MetricsOutput struct { + Complete []MetricsUsage + HasPrefix []MetricsUsage +} + const metricsImportPath = "github.com/honeycombio/refinery/metrics" +var packagesContainsPrefix = []string{"route", "main", "sample", "transmit"} + func GenerateMetricsMetadata() error { output, err := os.Create("metricsMeta.yaml") if err != nil { @@ -40,7 +47,7 @@ func GenerateMetricsMetadata() error { return fmt.Errorf("error loading packages: %v", err) } - usages := make([]MetricsUsage, 0) + var usages MetricsOutput // Traverse each package and file. 
for _, pkg := range pkgs { if !slices.Contains(maps.Keys(pkg.Imports), metricsImportPath) { @@ -87,7 +94,12 @@ func GenerateMetricsMetadata() error { if usage.Name == "" { continue } - usages = append(usages, usage) + if slices.Contains(packagesContainsPrefix, pkg.Name) { + fmt.Println("name", pkg.Name) + usages.HasPrefix = append(usages.HasPrefix, usage) + } else { + usages.Complete = append(usages.Complete, usage) + } found = true } } @@ -104,7 +116,7 @@ func GenerateMetricsMetadata() error { } } - if len(usages) == 0 { + if len(usages.Complete) == 0 && len(usages.HasPrefix) == 0 { return fmt.Errorf("No metrics.Metadata declarations found in all packages") } @@ -129,7 +141,6 @@ func exprToString(expr ast.Expr, pkg *packages.Package) string { strVal = constVal.Val().String() break } - } strVal = v.Name case *ast.BasicLit: @@ -143,7 +154,7 @@ func exprToString(expr ast.Expr, pkg *packages.Package) string { return strings.Trim(strVal, "\"") } -func writeMetricsToYAML(metricsUsages []MetricsUsage, output *os.File) error { +func writeMetricsToYAML(metricsUsages MetricsOutput, output *os.File) error { // Create a new YAML encoder and write the metrics encoder := yaml.NewEncoder(output) defer encoder.Close() diff --git a/tools/convert/metricsMeta.yaml b/tools/convert/metricsMeta.yaml index 4048d18c44..4ec8c48da9 100644 --- a/tools/convert/metricsMeta.yaml +++ b/tools/convert/metricsMeta.yaml @@ -1,280 +1,290 @@ -- name: collect_cache_buffer_overrun - type: Counter - unit: Dimensionless - description: The number of times the trace overwritten in the circular buffer has not yet been sent -- name: collect_cache_capacity - type: Gauge - unit: Dimensionless - description: The number of traces that can be stored in the cache -- name: collect_cache_entries - type: Histogram - unit: Dimensionless - description: The number of traces currently stored in the cache -- name: cuckoo_current_capacity - type: Gauge - unit: Dimensionless - description: current capacity of the cuckoo filter -- name: cuckoo_future_load_factor - type: Gauge - unit: Percent - description: the fraction of slots occupied in the future cuckoo filter -- name: cuckoo_current_load_factor - type: Gauge - unit: Percent - description: the fraction of slots occupied in the current cuckoo filter -- name: cache_recent_dropped_traces - type: Gauge - unit: Dimensionless - description: the current size of the most recent dropped trace cache -- name: collect_sent_reasons_cache_entries - type: Histogram - unit: Dimensionless - description: Number of entries in the sent reasons cache -- name: is_ready - type: Gauge - unit: Dimensionless - description: Whether the system is ready to receive traffic -- name: is_alive - type: Gauge - unit: Dimensionless - description: Whether the system is alive and reporting in -- name: redis_pubsub_published - type: Counter - unit: Dimensionless - description: Number of messages published to Redis PubSub -- name: redis_pubsub_received - type: Counter - unit: Dimensionless - description: Number of messages received from Redis PubSub -- name: local_pubsub_published - type: Counter - unit: Dimensionless - description: The total number of messages sent via the local pubsub implementation -- name: local_pubsub_received - type: Counter - unit: Dimensionless - description: The total number of messages received via the local pubsub implementation -- name: num_file_peers - type: Gauge - unit: Dimensionless - description: Number of peers in the file peer list -- name: num_peers - type: Gauge - unit: Dimensionless - 
description: the active number of peers in the cluster -- name: peer_hash - type: Gauge - unit: Dimensionless - description: the hash of the current list of peers -- name: peer_messages - type: Counter - unit: Dimensionless - description: the number of messages received by the peers service -- name: _num_dropped_by_drop_rule - type: Counter - unit: Dimensionless - description: Number of traces dropped by the drop rule -- name: _num_dropped - type: Counter - unit: Dimensionless - description: Number of traces dropped by configured sampler -- name: _num_kept - type: Counter - unit: Dimensionless - description: Number of traces kept by configured sampler -- name: _sample_rate - type: Histogram - unit: Dimensionless - description: Sample rate for traces -- name: enqueue_errors - type: Counter - unit: Dimensionless - description: The number of errors encountered when enqueueing events -- name: response_20x - type: Counter - unit: Dimensionless - description: The number of successful responses from Honeycomb -- name: response_errors - type: Counter - unit: Dimensionless - description: The number of errors encountered when sending events to Honeycomb -- name: queued_items - type: UpDown - unit: Dimensionless - description: The number of events queued for transmission to Honeycomb -- name: queue_time - type: Histogram - unit: Microseconds - description: The time spent in the queue before being sent to Honeycomb -- name: trace_duration_ms - type: Histogram - unit: Milliseconds - description: time taken to process a trace from arrival to send -- name: trace_span_count - type: Histogram - unit: Dimensionless - description: number of spans in a trace -- name: collector_incoming_queue - type: Histogram - unit: Dimensionless - description: number of spans currently in the incoming queue -- name: collector_peer_queue_length - type: Gauge - unit: Dimensionless - description: number of spans in the peer queue -- name: collector_incoming_queue_length - type: Gauge - unit: Dimensionless - description: number of spans in the incoming queue -- name: collector_peer_queue - type: Histogram - unit: Dimensionless - description: number of spans currently in the peer queue -- name: collector_cache_size - type: Gauge - unit: Dimensionless - description: number of traces currently stored in the trace cache -- name: memory_heap_allocation - type: Gauge - unit: Bytes - description: current heap allocation -- name: span_received - type: Counter - unit: Dimensionless - description: number of spans received by the collector -- name: span_processed - type: Counter - unit: Dimensionless - description: number of spans processed by the collector -- name: spans_waiting - type: UpDown - unit: Dimensionless - description: number of spans waiting to be processed by the collector -- name: trace_sent_cache_hit - type: Counter - unit: Dimensionless - description: number of late spans received for traces that have already been sent -- name: trace_accepted - type: Counter - unit: Dimensionless - description: number of new traces received by the collector -- name: trace_send_kept - type: Counter - unit: Dimensionless - description: number of traces that has been kept -- name: trace_send_dropped - type: Counter - unit: Dimensionless - description: number of traces that has been dropped -- name: trace_send_has_root - type: Counter - unit: Dimensionless - description: number of kept traces that have a root span -- name: trace_send_no_root - type: Counter - unit: Dimensionless - description: number of kept traces that do not have a root 
span -- name: trace_forwarded_on_peer_change - type: Gauge - unit: Dimensionless - description: number of traces forwarded due to peer membership change -- name: trace_redistribution_count - type: Gauge - unit: Dimensionless - description: number of traces redistributed due to peer membership change -- name: trace_send_on_shutdown - type: Counter - unit: Dimensionless - description: number of traces sent during shutdown -- name: trace_forwarded_on_shutdown - type: Counter - unit: Dimensionless - description: number of traces forwarded during shutdown -- name: trace_send_got_root - type: Counter - unit: Dimensionless - description: number of traces that are ready for decision due to root span arrival -- name: trace_send_expired - type: Counter - unit: Dimensionless - description: number of traces that are ready for decision due to TraceTimeout or SendDelay -- name: trace_send_span_limit - type: Counter - unit: Dimensionless - description: number of traces that are ready for decision due to span limit -- name: trace_send_ejected_full - type: Counter - unit: Dimensionless - description: number of traces that are ready for decision due to cache capacity overrun -- name: trace_send_ejected_memsize - type: Counter - unit: Dimensionless - description: number of traces that are ready for decision due to memory overrun -- name: trace_send_late_span - type: Counter - unit: Dimensionless - description: number of spans that are sent due to late span arrival -- name: dropped_from_stress - type: Counter - unit: Dimensionless - description: number of traces dropped due to stress relief -- name: cluster_stress_level - type: Gauge - unit: Dimensionless - description: The overall stress level of the cluster -- name: individual_stress_level - type: Gauge - unit: Dimensionless - description: The stress level of the individual node -- name: stress_level - type: Gauge - unit: Dimensionless - description: The stress level that's being used to determine whether to activate stress relief -- name: stress_relief_activated - type: Gauge - unit: Dimensionless - description: Whether stress relief is currently activated -- name: _router_proxied - type: Counter - unit: Dimensionless - description: the number of events proxied to another refinery -- name: _router_event - type: Counter - unit: Dimensionless - description: the number of events received -- name: config_hash - type: Gauge - unit: Dimensionless - description: The hash of the current configuration -- name: rule_config_hash - type: Gauge - unit: Dimensionless - description: The hash of the current rules configuration -- name: queue_length - type: Gauge - unit: Dimensionless - description: number of events waiting to be sent to destination -- name: queue_overflow - type: Counter - unit: Dimensionless - description: number of events dropped due to queue overflow -- name: send_errors - type: Counter - unit: Dimensionless - description: number of errors encountered while sending events to destination -- name: send_retries - type: Counter - unit: Dimensionless - description: number of times a batch of events was retried -- name: batches_sent - type: Counter - unit: Dimensionless - description: number of batches of events sent to destination -- name: messages_sent - type: Counter - unit: Dimensionless - description: number of messages sent to destination -- name: response_decode_errors - type: Counter - unit: Dimensionless - description: number of errors encountered while decoding responses from destination +complete: + - name: collect_cache_buffer_overrun + type: 
Counter + unit: Dimensionless + description: The number of times the trace overwritten in the circular buffer has not yet been sent + - name: collect_cache_capacity + type: Gauge + unit: Dimensionless + description: The number of traces that can be stored in the cache + - name: collect_cache_entries + type: Histogram + unit: Dimensionless + description: The number of traces currently stored in the cache + - name: cuckoo_current_capacity + type: Gauge + unit: Dimensionless + description: current capacity of the cuckoo filter + - name: cuckoo_future_load_factor + type: Gauge + unit: Percent + description: the fraction of slots occupied in the future cuckoo filter + - name: cuckoo_current_load_factor + type: Gauge + unit: Percent + description: the fraction of slots occupied in the current cuckoo filter + - name: cache_recent_dropped_traces + type: Gauge + unit: Dimensionless + description: the current size of the most recent dropped trace cache + - name: collect_sent_reasons_cache_entries + type: Histogram + unit: Dimensionless + description: Number of entries in the sent reasons cache + - name: is_ready + type: Gauge + unit: Dimensionless + description: Whether the system is ready to receive traffic + - name: is_alive + type: Gauge + unit: Dimensionless + description: Whether the system is alive and reporting in + - name: redis_pubsub_published + type: Counter + unit: Dimensionless + description: Number of messages published to Redis PubSub + - name: redis_pubsub_received + type: Counter + unit: Dimensionless + description: Number of messages received from Redis PubSub + - name: local_pubsub_published + type: Counter + unit: Dimensionless + description: The total number of messages sent via the local pubsub implementation + - name: local_pubsub_received + type: Counter + unit: Dimensionless + description: The total number of messages received via the local pubsub implementation + - name: num_file_peers + type: Gauge + unit: Dimensionless + description: Number of peers in the file peer list + - name: num_peers + type: Gauge + unit: Dimensionless + description: the active number of peers in the cluster + - name: peer_hash + type: Gauge + unit: Dimensionless + description: the hash of the current list of peers + - name: peer_messages + type: Counter + unit: Dimensionless + description: the number of messages received by the peers service + - name: trace_duration_ms + type: Histogram + unit: Milliseconds + description: time taken to process a trace from arrival to send + - name: trace_span_count + type: Histogram + unit: Dimensionless + description: number of spans in a trace + - name: collector_incoming_queue + type: Histogram + unit: Dimensionless + description: number of spans currently in the incoming queue + - name: collector_peer_queue_length + type: Gauge + unit: Dimensionless + description: number of spans in the peer queue + - name: collector_incoming_queue_length + type: Gauge + unit: Dimensionless + description: number of spans in the incoming queue + - name: collector_peer_queue + type: Histogram + unit: Dimensionless + description: number of spans currently in the peer queue + - name: collector_cache_size + type: Gauge + unit: Dimensionless + description: number of traces currently stored in the trace cache + - name: memory_heap_allocation + type: Gauge + unit: Bytes + description: current heap allocation + - name: span_received + type: Counter + unit: Dimensionless + description: number of spans received by the collector + - name: span_processed + type: Counter + unit: 
Dimensionless + description: number of spans processed by the collector + - name: spans_waiting + type: UpDown + unit: Dimensionless + description: number of spans waiting to be processed by the collector + - name: trace_sent_cache_hit + type: Counter + unit: Dimensionless + description: number of late spans received for traces that have already been sent + - name: trace_accepted + type: Counter + unit: Dimensionless + description: number of new traces received by the collector + - name: trace_send_kept + type: Counter + unit: Dimensionless + description: number of traces that has been kept + - name: trace_send_dropped + type: Counter + unit: Dimensionless + description: number of traces that has been dropped + - name: trace_send_has_root + type: Counter + unit: Dimensionless + description: number of kept traces that have a root span + - name: trace_send_no_root + type: Counter + unit: Dimensionless + description: number of kept traces that do not have a root span + - name: trace_forwarded_on_peer_change + type: Gauge + unit: Dimensionless + description: number of traces forwarded due to peer membership change + - name: trace_redistribution_count + type: Gauge + unit: Dimensionless + description: number of traces redistributed due to peer membership change + - name: trace_send_on_shutdown + type: Counter + unit: Dimensionless + description: number of traces sent during shutdown + - name: trace_forwarded_on_shutdown + type: Counter + unit: Dimensionless + description: number of traces forwarded during shutdown + - name: trace_send_got_root + type: Counter + unit: Dimensionless + description: number of traces that are ready for decision due to root span arrival + - name: trace_send_expired + type: Counter + unit: Dimensionless + description: number of traces that are ready for decision due to TraceTimeout or SendDelay + - name: trace_send_span_limit + type: Counter + unit: Dimensionless + description: number of traces that are ready for decision due to span limit + - name: trace_send_ejected_full + type: Counter + unit: Dimensionless + description: number of traces that are ready for decision due to cache capacity overrun + - name: trace_send_ejected_memsize + type: Counter + unit: Dimensionless + description: number of traces that are ready for decision due to memory overrun + - name: trace_send_late_span + type: Counter + unit: Dimensionless + description: number of spans that are sent due to late span arrival + - name: dropped_from_stress + type: Counter + unit: Dimensionless + description: number of traces dropped due to stress relief + - name: trace_kept_sample_rate + type: Histogram + unit: Dimensionless + description: sample rate of kept traces + - name: trace_aggregate_sample_rate + type: Histogram + unit: Dimensionless + description: aggregate sample rate of both kept and dropped traces + - name: cluster_stress_level + type: Gauge + unit: Dimensionless + description: The overall stress level of the cluster + - name: individual_stress_level + type: Gauge + unit: Dimensionless + description: The stress level of the individual node + - name: stress_level + type: Gauge + unit: Dimensionless + description: The stress level that's being used to determine whether to activate stress relief + - name: stress_relief_activated + type: Gauge + unit: Dimensionless + description: Whether stress relief is currently activated + - name: config_hash + type: Gauge + unit: Dimensionless + description: The hash of the current configuration + - name: rule_config_hash + type: Gauge + unit: Dimensionless + 
description: The hash of the current rules configuration +hasprefix: + - name: _num_dropped_by_drop_rule + type: Counter + unit: Dimensionless + description: Number of traces dropped by the drop rule + - name: _num_dropped + type: Counter + unit: Dimensionless + description: Number of traces dropped by configured sampler + - name: _num_kept + type: Counter + unit: Dimensionless + description: Number of traces kept by configured sampler + - name: _sample_rate + type: Histogram + unit: Dimensionless + description: Sample rate for traces + - name: enqueue_errors + type: Counter + unit: Dimensionless + description: The number of errors encountered when enqueueing events + - name: response_20x + type: Counter + unit: Dimensionless + description: The number of successful responses from Honeycomb + - name: response_errors + type: Counter + unit: Dimensionless + description: The number of errors encountered when sending events to Honeycomb + - name: queued_items + type: UpDown + unit: Dimensionless + description: The number of events queued for transmission to Honeycomb + - name: queue_time + type: Histogram + unit: Microseconds + description: The time spent in the queue before being sent to Honeycomb + - name: _router_proxied + type: Counter + unit: Dimensionless + description: the number of events proxied to another refinery + - name: _router_event + type: Counter + unit: Dimensionless + description: the number of events received + - name: queue_length + type: Gauge + unit: Dimensionless + description: number of events waiting to be sent to destination + - name: queue_overflow + type: Counter + unit: Dimensionless + description: number of events dropped due to queue overflow + - name: send_errors + type: Counter + unit: Dimensionless + description: number of errors encountered while sending events to destination + - name: send_retries + type: Counter + unit: Dimensionless + description: number of times a batch of events was retried + - name: batches_sent + type: Counter + unit: Dimensionless + description: number of batches of events sent to destination + - name: messages_sent + type: Counter + unit: Dimensionless + description: number of messages sent to destination + - name: response_decode_errors + type: Counter + unit: Dimensionless + description: number of errors encountered while decoding responses from destination diff --git a/tools/convert/templates/metrics.tmpl b/tools/convert/templates/metrics.tmpl index 095578af36..44f44595fa 100644 --- a/tools/convert/templates/metrics.tmpl +++ b/tools/convert/templates/metrics.tmpl @@ -1,10 +1,26 @@ -# Metrics Documentation -# Automatically generated {{ now }} +# Honeycomb Refinery Metrics Documentation -This document contains the description of various metrics used in the system. +This document contains the description of various metrics used in Refinery. +It was automatically generated {{ now }}. + +Note: This document does not include metrics defined in the dynsampler-go dependency, as those metrics are generated dynamically at runtime. As a result, certain metrics may be missing or incomplete in this document, but they will still be available during execution with their full names. + +## Complete Metrics +This table includes metrics with fully defined names. + +| Name | Type | Unit | Description | +|------|------|------|-------------| +{{- range .Complete }} +| {{ .Name }} | {{ .Type }} | {{ .Unit }} | {{ .Description }} | +{{- end }} + + +## Metrics with Prefix +This table includes metrics with partially defined names. 
+Metrics in this table don't contain their expected prefixes. This is because the auto-generator is unable to resolve dynamically created metric names during the generation process. | Name | Type | Unit | Description | |------|------|------|-------------| -{{- range . }} +{{- range .HasPrefix }} | {{ .Name }} | {{ .Type }} | {{ .Unit }} | {{ .Description }} | {{- end }} From 19986e4b5a0eb1925c115880a3add2667fd32bc1 Mon Sep 17 00:00:00 2001 From: Yingrong Zhao <22300958+VinozzZ@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:01:26 -0400 Subject: [PATCH 13/25] feat: extract decision span from full span (#1338) Extract only necessary information that's needed for trace decision from a full span so that we can forward only the key fields to peers later part of #1318 - add a method on `Span` to extract only necessary information into a new types.Event - add a method to differentiate a full span from a decision span - store `IsRoot` information on `Span` - simplify naming for span annotation types --- collect/collect.go | 24 ++---------- collect/collect_test.go | 86 ++++++++++------------------------------- types/event.go | 54 +++++++++++++++++++++++++- types/event_test.go | 66 +++++++++++++++++++++++++++++++ 4 files changed, 142 insertions(+), 88 deletions(-) diff --git a/collect/collect.go b/collect/collect.go index 59a5f85d53..4faad8681e 100644 --- a/collect/collect.go +++ b/collect/collect.go @@ -547,7 +547,7 @@ func (i *InMemCollector) processSpan(sp *types.Span) { } // if this is a root span, say so and send the trace - if i.isRootSpan(sp) { + if sp.IsRoot { markTraceForSending = true trace.RootSpan = sp } @@ -673,8 +673,7 @@ func (i *InMemCollector) dealWithSentTrace(ctx context.Context, tr cache.TraceSe i.Logger.Debug().WithField("trace_id", sp.TraceID).Logf("Sending span because of previous decision to send trace") mergeTraceAndSpanSampleRates(sp, tr.Rate(), isDryRun) // if this span is a late root span, possibly update it with our current span count - isRootSpan := i.isRootSpan(sp) - if isRootSpan { + if sp.IsRoot { if i.Config.GetAddCountsToRoot() { sp.Data["meta.span_event_count"] = int64(tr.SpanEventCount()) sp.Data["meta.span_link_count"] = int64(tr.SpanLinkCount()) @@ -684,7 +683,7 @@ func (i *InMemCollector) dealWithSentTrace(ctx context.Context, tr cache.TraceSe sp.Data["meta.span_count"] = int64(tr.DescendantCount()) } } - otelutil.AddSpanField(span, "is_root_span", isRootSpan) + otelutil.AddSpanField(span, "is_root_span", sp.IsRoot) i.Metrics.Increment(TraceSendLateSpan) i.addAdditionalAttributes(sp) i.Transmission.EnqueueSpan(sp) @@ -720,21 +719,6 @@ func mergeTraceAndSpanSampleRates(sp *types.Span, traceSampleRate uint, dryRunMo } } -func (i *InMemCollector) isRootSpan(sp *types.Span) bool { - // log event should never be considered a root span, check for that first - if signalType := sp.Data["meta.signal_type"]; signalType == "log" { - return false - } - // check if the event has a parent id using the configured parent id field names - for _, parentIdFieldName := range i.Config.GetParentIdFieldNames() { - parentId := sp.Data[parentIdFieldName] - if _, ok := parentId.(string); ok && parentId != "" { - return false - } - } - return true -} - func (i *InMemCollector) send(trace *types.Trace, sendReason string) { if trace.Sent { // someone else already sent this so we shouldn't also send it. 
@@ -830,7 +814,7 @@ func (i *InMemCollector) send(trace *types.Trace, sendReason string) { // update the root span (if we have one, which we might not if the trace timed out) // with the final total as of our send time - if i.isRootSpan(sp) { + if sp.IsRoot { if i.Config.GetAddCountsToRoot() { sp.Data["meta.span_event_count"] = int64(trace.SpanEventCount()) sp.Data["meta.span_link_count"] = int64(trace.SpanLinkCount()) diff --git a/collect/collect_test.go b/collect/collect_test.go index 68ee4020f9..fe78fb602d 100644 --- a/collect/collect_test.go +++ b/collect/collect_test.go @@ -120,6 +120,7 @@ func TestAddRootSpan(t *testing.T) { Dataset: "aoeu", APIKey: legacyAPIKey, }, + IsRoot: true, } coll.AddSpan(span) time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) @@ -141,6 +142,7 @@ func TestAddRootSpan(t *testing.T) { Dataset: "aoeu", APIKey: legacyAPIKey, }, + IsRoot: true, } coll.AddSpanFromPeer(span) time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) @@ -233,6 +235,7 @@ func TestOriginalSampleRateIsNotedInMetaField(t *testing.T) { SampleRate: 0, // no upstream sampling Data: make(map[string]interface{}), }, + IsRoot: true, }) require.NoError(t, err, "must be able to add the span") @@ -293,6 +296,7 @@ func TestTransmittedSpansShouldHaveASampleRateOfAtLeastOne(t *testing.T) { SampleRate: 0, // This should get lifted to 1 Data: make(map[string]interface{}), }, + IsRoot: true, } coll.AddSpan(span) @@ -378,6 +382,7 @@ func TestAddSpan(t *testing.T) { Data: map[string]interface{}{}, APIKey: legacyAPIKey, }, + IsRoot: true, } coll.AddSpan(rootSpan) time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 5) @@ -452,6 +457,7 @@ func TestDryRunMode(t *testing.T) { Data: map[string]interface{}{}, APIKey: legacyAPIKey, }, + IsRoot: true, } coll.AddSpan(span) time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) @@ -488,6 +494,7 @@ func TestDryRunMode(t *testing.T) { Data: map[string]interface{}{}, APIKey: legacyAPIKey, }, + IsRoot: true, } coll.AddSpanFromPeer(span) time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) @@ -511,6 +518,7 @@ func TestDryRunMode(t *testing.T) { Data: map[string]interface{}{}, APIKey: legacyAPIKey, }, + IsRoot: true, } coll.AddSpan(span) time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) @@ -642,6 +650,7 @@ func TestSampleConfigReload(t *testing.T) { Dataset: dataset, APIKey: legacyAPIKey, }, + IsRoot: true, } coll.AddSpan(span) @@ -669,6 +678,7 @@ func TestSampleConfigReload(t *testing.T) { Dataset: dataset, APIKey: legacyAPIKey, }, + IsRoot: true, } coll.AddSpan(span) @@ -930,6 +940,7 @@ func TestAddCountsToRoot(t *testing.T) { Data: map[string]interface{}{}, APIKey: legacyAPIKey, }, + IsRoot: true, } coll.AddSpan(rootSpan) time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) @@ -1021,6 +1032,7 @@ func TestLateRootGetsCounts(t *testing.T) { Data: map[string]interface{}{}, APIKey: legacyAPIKey, }, + IsRoot: true, } coll.AddSpan(rootSpan) time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) @@ -1101,6 +1113,7 @@ func TestAddSpanCount(t *testing.T) { Data: map[string]interface{}{}, APIKey: legacyAPIKey, }, + IsRoot: true, } coll.AddSpan(rootSpan) time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) @@ -1176,6 +1189,7 @@ func TestLateRootGetsSpanCount(t *testing.T) { Data: map[string]interface{}{}, APIKey: legacyAPIKey, }, + IsRoot: true, } coll.AddSpan(rootSpan) time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) @@ -1247,6 +1261,7 @@ func TestLateSpanNotDecorated(t *testing.T) { Data: 
map[string]interface{}{}, APIKey: legacyAPIKey, }, + IsRoot: true, } coll.AddSpan(rootSpan) @@ -1317,6 +1332,7 @@ func TestAddAdditionalAttributes(t *testing.T) { Data: map[string]interface{}{}, APIKey: legacyAPIKey, }, + IsRoot: true, } coll.AddSpan(rootSpan) time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 5) @@ -1393,6 +1409,7 @@ func TestStressReliefSampleRate(t *testing.T) { APIKey: legacyAPIKey, SampleRate: 10, }, + IsRoot: true, } processed2, kept2 := coll.ProcessSpanImmediately(rootSpan) @@ -1473,6 +1490,7 @@ func TestStressReliefDecorateHostname(t *testing.T) { Data: map[string]interface{}{}, APIKey: legacyAPIKey, }, + IsRoot: true, } coll.AddSpan(rootSpan) time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) @@ -1591,6 +1609,7 @@ func TestSpanWithRuleReasons(t *testing.T) { }, APIKey: legacyAPIKey, }, + IsRoot: true, } if i == 0 { rootSpan.Data["test"] = int64(1) @@ -1624,72 +1643,6 @@ func TestSpanWithRuleReasons(t *testing.T) { transmission.Mux.RUnlock() } -func TestIsRootSpan(t *testing.T) { - tesCases := []struct { - name string - span *types.Span - expected bool - }{ - { - name: "root span - no parent id", - span: &types.Span{ - Event: types.Event{ - Data: map[string]interface{}{}, - }, - }, - expected: true, - }, - { - name: "root span - empty parent id", - span: &types.Span{ - Event: types.Event{ - Data: map[string]interface{}{ - "trace.parent_id": "", - }, - }, - }, - expected: true, - }, - { - name: "non-root span - parent id", - span: &types.Span{ - Event: types.Event{ - Data: map[string]interface{}{ - "trace.parent_id": "some-id", - }, - }, - }, - expected: false, - }, - { - name: "non-root span - no parent id but has signal_type of log", - span: &types.Span{ - Event: types.Event{ - Data: map[string]interface{}{ - "meta.signal_type": "log", - }, - }, - }, - expected: false, - }, - } - - collector := &InMemCollector{ - Config: &config.MockConfig{ - ParentIdFieldNames: []string{"trace.parent_id", "parentId"}, - GetCollectionConfigVal: config.CollectionConfig{ - ShutdownDelay: config.Duration(1 * time.Millisecond), - }, - }, - } - - for _, tc := range tesCases { - t.Run(tc.name, func(t *testing.T) { - assert.Equal(t, tc.expected, collector.isRootSpan(tc.span)) - }) - } -} - func TestRedistributeTraces(t *testing.T) { conf := &config.MockConfig{ GetTracesConfigVal: config.TracesConfig{ @@ -1942,6 +1895,7 @@ func TestBigTracesGoEarly(t *testing.T) { Data: map[string]interface{}{}, APIKey: legacyAPIKey, }, + IsRoot: true, } coll.AddSpan(rootSpan) diff --git a/types/event.go b/types/event.go index c6fc76039b..f80849e69d 100644 --- a/types/event.go +++ b/types/event.go @@ -2,6 +2,7 @@ package types import ( "context" + "slices" "time" huskyotlp "github.com/honeycombio/husky/otlp" @@ -102,6 +103,12 @@ func (t *Trace) GetSpans() []*Span { return t.spans } +func (t *Trace) RemoveDecisionSpans() { + t.spans = slices.DeleteFunc(t.spans, func(sp *Span) bool { + return sp.IsDecisionSpan() + }) +} + func (t *Trace) ID() string { return t.TraceID } @@ -172,8 +179,8 @@ func (t *Trace) GetSamplerKey() (string, bool) { env := "" for _, sp := range t.GetSpans() { - if sp.Event.Environment != "" { - env = sp.Event.Environment + if sp.Environment != "" { + env = sp.Environment break } } @@ -187,6 +194,43 @@ type Span struct { TraceID string DataSize int ArrivalTime time.Time + IsRoot bool +} + +// IsDecicionSpan returns true if the span is a decision span based on +// a flag set in the span's metadata. 
+func (sp *Span) IsDecisionSpan() bool { + if sp.Data == nil { + return false + } + v, ok := sp.Data["meta.refinery.min_span"] + if !ok { + return false + } + isDecisionSpan, ok := v.(bool) + if !ok { + return false + } + + return isDecisionSpan +} + +// ExtractDecisionContext returns a new Event that contains only the data that is +// relevant to the decision-making process. +func (sp *Span) ExtractDecisionContext() *Event { + decisionCtx := sp.Event + dataSize := sp.DataSize + if dataSize == 0 { + dataSize = sp.GetDataSize() + } + decisionCtx.Data = map[string]interface{}{ + "trace_id": sp.TraceID, + "meta.refinery.root": sp.IsRoot, + "meta.refinery.min_span": true, + "meta.annotation_type": sp.AnnotationType(), + "meta.refinery.span_data_size": dataSize, + } + return &decisionCtx } // GetDataSize computes the size of the Data element of the Span. @@ -194,6 +238,12 @@ type Span struct { // relative ordering, not absolute calculations. func (sp *Span) GetDataSize() int { total := 0 + + if sp.IsDecisionSpan() { + if v, ok := sp.Data["meta.refinery.span_data_size"]; ok { + return v.(int) + } + } // the data types we should be getting from JSON are: // float64, int64, bool, string, []byte for _, v := range sp.Data { diff --git a/types/event_test.go b/types/event_test.go index 0f949b4639..b1ac8881ac 100644 --- a/types/event_test.go +++ b/types/event_test.go @@ -4,6 +4,9 @@ import ( "strconv" "strings" "testing" + "time" + + "github.com/stretchr/testify/assert" ) func TestSpan_GetDataSize(t *testing.T) { @@ -65,6 +68,69 @@ func TestSpan_AnnotationType(t *testing.T) { } } +func TestSpan_ExtractDecisionContext(t *testing.T) { + ev := Event{ + APIHost: "test.api.com", + APIKey: "test-api-key", + Dataset: "test-dataset", + Environment: "test-environment", + SampleRate: 5, + Timestamp: time.Now(), + Data: map[string]interface{}{ + "test": "test", + "meta.annotation_type": "span_event", + }, + } + sp := &Span{ + Event: ev, + TraceID: "test-trace-id", + ArrivalTime: time.Now(), + IsRoot: true, + } + + got := sp.ExtractDecisionContext() + assert.Equal(t, ev.APIHost, got.APIHost) + assert.Equal(t, ev.APIKey, got.APIKey) + assert.Equal(t, ev.Dataset, got.Dataset) + assert.Equal(t, ev.Environment, got.Environment) + assert.Equal(t, ev.SampleRate, got.SampleRate) + assert.Equal(t, ev.Timestamp, got.Timestamp) + assert.Equal(t, map[string]interface{}{ + "trace_id": sp.TraceID, + "meta.refinery.root": true, + "meta.refinery.min_span": true, + "meta.annotation_type": SpanAnnotationTypeSpanEvent, + "meta.refinery.span_data_size": 14, + }, got.Data) +} + +func TestSpan_IsDecisionSpan(t *testing.T) { + tests := []struct { + name string + data map[string]any + want bool + }{ + {"nil meta", nil, false}, + {"no meta", map[string]any{}, false}, + {"no meta.refinery.min_span", map[string]any{"meta.annotation_type": "span_event"}, false}, + {"invalid min_span", map[string]any{"meta.annotation_type": "span_event", "meta.refinery.mi_span": true}, false}, + {"is decision span", map[string]any{"meta.annotation_type": "span_event", "meta.refinery.min_span": true}, true}, + {"is not decision span", map[string]any{"meta.annotation_type": "span_event", "meta.refinery.min_span": false}, false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + sp := &Span{ + Event: Event{ + Data: tt.data, + }, + } + got := sp.IsDecisionSpan() + assert.Equal(t, tt.want, got) + }) + } +} + // These benchmarks were just to verify that the size calculation is acceptable // even on big spans. 
The P99 for normal (20-field) spans shows that it will take ~1 // microsecond (on an m1 laptop) but a 1000-field span (extremely rare!) will take From 6104c794f59b72fb2344e8fb3657840888e3bd19 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 1 Oct 2024 17:21:38 -0400 Subject: [PATCH 14/25] maint(deps): bump the minor-patch group with 13 updates (#1357) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps the minor-patch group with 13 updates: | Package | From | To | | --- | --- | --- | | [github.com/agnivade/levenshtein](https://github.com/agnivade/levenshtein) | `1.1.1` | `1.2.0` | | [github.com/klauspost/compress](https://github.com/klauspost/compress) | `1.17.9` | `1.17.10` | | [github.com/prometheus/client_golang](https://github.com/prometheus/client_golang) | `1.20.2` | `1.20.4` | | [go.opentelemetry.io/otel](https://github.com/open-telemetry/opentelemetry-go) | `1.29.0` | `1.30.0` | | [go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp](https://github.com/open-telemetry/opentelemetry-go) | `1.29.0` | `1.30.0` | | [go.opentelemetry.io/otel/exporters/otlp/otlptrace](https://github.com/open-telemetry/opentelemetry-go) | `1.29.0` | `1.30.0` | | [go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp](https://github.com/open-telemetry/opentelemetry-go) | `1.29.0` | `1.30.0` | | [go.opentelemetry.io/otel/metric](https://github.com/open-telemetry/opentelemetry-go) | `1.29.0` | `1.30.0` | | [go.opentelemetry.io/otel/sdk](https://github.com/open-telemetry/opentelemetry-go) | `1.29.0` | `1.30.0` | | [go.opentelemetry.io/otel/sdk/metric](https://github.com/open-telemetry/opentelemetry-go) | `1.29.0` | `1.30.0` | | [go.opentelemetry.io/otel/trace](https://github.com/open-telemetry/opentelemetry-go) | `1.29.0` | `1.30.0` | | [go.uber.org/automaxprocs](https://github.com/uber-go/automaxprocs) | `1.5.3` | `1.6.0` | | [google.golang.org/grpc](https://github.com/grpc/grpc-go) | `1.66.0` | `1.66.1` | Updates `github.com/agnivade/levenshtein` from 1.1.1 to 1.2.0
Commits
  • fac16fe Add tests on long strings with few different characters
  • 4e472bb Remove leading and trailing identical runes
  • 02603e0 Use min() builtin function, update Go minimum version to 1.21
  • 8c681eb Update dgryski/trifles to latest version
  • f8ac98b Update CI to use 1.23
  • aad2eb7 Update CI to use 1.20
  • 352e1e7 Update CI to 1.19
  • 221f3cb Include new Go versions
  • 95ae884 Updated badge
  • ab363df Added Github actions
  • See full diff in compare view

Updates `github.com/klauspost/compress` from 1.17.9 to 1.17.10
Release notes

Sourced from github.com/klauspost/compress's releases.

v1.17.10

What's Changed

New Contributors

Full Changelog: https://github.com/klauspost/compress/compare/v1.17.9...v1.17.10

Commits

Updates `github.com/prometheus/client_golang` from 1.20.2 to 1.20.4
Release notes

Sourced from github.com/prometheus/client_golang's releases.

v1.20.4

  • [BUGFIX] histograms: Fix a possible data race when appending exemplars vs metrics gather. #1623

v1.20.3

  • [BUGFIX] histograms: Fix possible data race when appending exemplars. #1608
Changelog

Sourced from github.com/prometheus/client_golang's changelog.

Unreleased

  • [BUGFIX] histograms: Fix possible data race when appending exemplars vs metrics gather. #1623

1.20.3 / 2024-09-05

  • [BUGFIX] histograms: Fix possible data race when appending exemplars. #1608
Commits
  • 05fcde9 Merge pull request #1623 from krajorama/data-race-in-histogram-write
  • 209f4c0 Add changelog
  • 1e398cc native histogram: Fix race between Write and addExemplar
  • ef2f87e Merge pull request #1620 from prometheus/arthursens/prepare-1.20.3
  • 937ac63 Add changelog entry for 1.20.3
  • 6e9914d Merge pull request #1608 from krajorama/index-out-of-range-native-histogram-e...
  • d6b8c89 Update comments with more explanations
  • 504566f Use simplified solution from #1609 for the data race
  • dc8e9a4 fix: native histogram: Simplify and fix addExemplar
  • dc819ce Use a trivial solution to #1605
  • Additional commits viewable in compare view

Updates `go.opentelemetry.io/otel` from 1.29.0 to 1.30.0
Changelog

Sourced from go.opentelemetry.io/otel's changelog.

[1.30.0/0.52.0/0.6.0/0.0.9] 2024-09-09

Added

  • Support OTEL_EXPORTER_OTLP_LOGS_INSECURE and OTEL_EXPORTER_OTLP_INSECURE environments in go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc. (#5739)
  • The WithResource option for NewMeterProvider now merges the provided resources with the ones from environment variables. (#5773)
  • The WithResource option for NewLoggerProvider now merges the provided resources with the ones from environment variables. (#5773)
  • Add UTF-8 support to go.opentelemetry.io/otel/exporters/prometheus. (#5755)

Fixed

  • Fix memory leak in the global MeterProvider when identical instruments are repeatedly created. (#5754)
  • Fix panic on instruments creation when setting meter provider. (#5758)
  • Fix an issue where SetMeterProvider in go.opentelemetry.io/otel might miss the delegation for instruments and registries. (#5780)

Removed

Commits
  • ed4fc75 Release v1.30.0/v0.52.0/v0.6.0/v0.0.9 (#5797)
  • cdd2dbb Drop support for Go 1.21 in dice example (#5800)
  • e9ac0d2 fix(deps): update module google.golang.org/grpc to v1.66.1 (#5798)
  • 4cc9fee fix(deps): update golang.org/x/exp digest to 701f63a (#5795)
  • 71b341f Add utf8 support to the prometheus exporter (#5755)
  • 506a9ba Fix typos (#5763)
  • b37e8a9 SetMeterProvider might miss the delegation for instruments and registries (...
  • 9e1b015 fix(metric, log): merge explicit resource with environment variables (#5773)
  • 8dca9cc Support OTEL_EXPORTER_OTLP_LOGS_INSECURE and OTEL_EXPORTER_OTLP_INSECURE envi...
  • fb7cc02 fix(deps): update module github.com/prometheus/client_golang to v1.20.3 (#5788)
  • Additional commits viewable in compare view
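
The resource-merging change called out in the changelog above (#5773) is the one most likely to be visible to callers: attributes passed to the SDK via WithResource are now merged with any set through OTEL_RESOURCE_ATTRIBUTES / OTEL_SERVICE_NAME instead of silently replacing them. Below is a minimal, illustrative sketch of that call pattern using the standard sdk/metric and sdk/resource packages; the `newMeterProvider` helper and the "refinery" service name are examples only, not code from this repository.

```go
package main

import (
	"context"
	"log"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	sdkmetric "go.opentelemetry.io/otel/sdk/metric"
	"go.opentelemetry.io/otel/sdk/resource"
)

// newMeterProvider builds a MeterProvider whose explicit resource attributes
// are, per the 1.30.0 changelog entry above, merged with attributes taken
// from OTEL_RESOURCE_ATTRIBUTES / OTEL_SERVICE_NAME rather than replacing them.
func newMeterProvider(ctx context.Context) (*sdkmetric.MeterProvider, error) {
	res, err := resource.New(ctx,
		resource.WithAttributes(attribute.String("service.name", "refinery")),
	)
	if err != nil {
		return nil, err
	}
	return sdkmetric.NewMeterProvider(sdkmetric.WithResource(res)), nil
}

func main() {
	ctx := context.Background()
	mp, err := newMeterProvider(ctx)
	if err != nil {
		log.Fatal(err)
	}
	defer mp.Shutdown(ctx)
	otel.SetMeterProvider(mp)
}
```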

Updates `go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp` from 1.29.0 to 1.30.0
Changelog

Sourced from go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp's changelog.

[1.30.0/0.52.0/0.6.0/0.0.9] 2024-09-09

Added

  • Support OTEL_EXPORTER_OTLP_LOGS_INSECURE and OTEL_EXPORTER_OTLP_INSECURE environments in go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc. (#5739)
  • The WithResource option for NewMeterProvider now merges the provided resources with the ones from environment variables. (#5773)
  • The WithResource option for NewLoggerProvider now merges the provided resources with the ones from environment variables. (#5773)
  • Add UTF-8 support to go.opentelemetry.io/otel/exporters/prometheus. (#5755)

Fixed

  • Fix memory leak in the global MeterProvider when identical instruments are repeatedly created. (#5754)
  • Fix panic on instruments creation when setting meter provider. (#5758)
  • Fix an issue where SetMeterProvider in go.opentelemetry.io/otel might miss the delegation for instruments and registries. (#5780)

Removed

Commits
  • ed4fc75 Release v1.30.0/v0.52.0/v0.6.0/v0.0.9 (#5797)
  • cdd2dbb Drop support for Go 1.21 in dice example (#5800)
  • e9ac0d2 fix(deps): update module google.golang.org/grpc to v1.66.1 (#5798)
  • 4cc9fee fix(deps): update golang.org/x/exp digest to 701f63a (#5795)
  • 71b341f Add utf8 support to the prometheus exporter (#5755)
  • 506a9ba Fix typos (#5763)
  • b37e8a9 SetMeterProvider might miss the delegation for instruments and registries (...
  • 9e1b015 fix(metric, log): merge explicit resource with environment variables (#5773)
  • 8dca9cc Support OTEL_EXPORTER_OTLP_LOGS_INSECURE and OTEL_EXPORTER_OTLP_INSECURE envi...
  • fb7cc02 fix(deps): update module github.com/prometheus/client_golang to v1.20.3 (#5788)
  • Additional commits viewable in compare view

Updates `go.opentelemetry.io/otel/exporters/otlp/otlptrace` from 1.29.0 to 1.30.0
Changelog

Sourced from go.opentelemetry.io/otel/exporters/otlp/otlptrace's changelog.

[1.30.0/0.52.0/0.6.0/0.0.9] 2024-09-09

Added

  • Support OTEL_EXPORTER_OTLP_LOGS_INSECURE and OTEL_EXPORTER_OTLP_INSECURE environments in go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc. (#5739)
  • The WithResource option for NewMeterProvider now merges the provided resources with the ones from environment variables. (#5773)
  • The WithResource option for NewLoggerProvider now merges the provided resources with the ones from environment variables. (#5773)
  • Add UTF-8 support to go.opentelemetry.io/otel/exporters/prometheus. (#5755)

Fixed

  • Fix memory leak in the global MeterProvider when identical instruments are repeatedly created. (#5754)
  • Fix panic on instruments creation when setting meter provider. (#5758)
  • Fix an issue where SetMeterProvider in go.opentelemetry.io/otel might miss the delegation for instruments and registries. (#5780)

Removed

Commits
  • ed4fc75 Release v1.30.0/v0.52.0/v0.6.0/v0.0.9 (#5797)
  • cdd2dbb Drop support for Go 1.21 in dice example (#5800)
  • e9ac0d2 fix(deps): update module google.golang.org/grpc to v1.66.1 (#5798)
  • 4cc9fee fix(deps): update golang.org/x/exp digest to 701f63a (#5795)
  • 71b341f Add utf8 support to the prometheus exporter (#5755)
  • 506a9ba Fix typos (#5763)
  • b37e8a9 SetMeterProvider might miss the delegation for instruments and registries (...
  • 9e1b015 fix(metric, log): merge explicit resource with environment variables (#5773)
  • 8dca9cc Support OTEL_EXPORTER_OTLP_LOGS_INSECURE and OTEL_EXPORTER_OTLP_INSECURE envi...
  • fb7cc02 fix(deps): update module github.com/prometheus/client_golang to v1.20.3 (#5788)
  • Additional commits viewable in compare view

Updates `go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp` from 1.29.0 to 1.30.0
Changelog

Sourced from go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp's changelog.

[1.30.0/0.52.0/0.6.0/0.0.9] 2024-09-09

Added

  • Support OTEL_EXPORTER_OTLP_LOGS_INSECURE and OTEL_EXPORTER_OTLP_INSECURE environments in go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc. (#5739)
  • The WithResource option for NewMeterProvider now merges the provided resources with the ones from environment variables. (#5773)
  • The WithResource option for NewLoggerProvider now merges the provided resources with the ones from environment variables. (#5773)
  • Add UTF-8 support to go.opentelemetry.io/otel/exporters/prometheus. (#5755)

Fixed

  • Fix memory leak in the global MeterProvider when identical instruments are repeatedly created. (#5754)
  • Fix panic on instruments creation when setting meter provider. (#5758)
  • Fix an issue where SetMeterProvider in go.opentelemetry.io/otel might miss the delegation for instruments and registries. (#5780)

Removed

Commits
  • ed4fc75 Release v1.30.0/v0.52.0/v0.6.0/v0.0.9 (#5797)
  • cdd2dbb Drop support for Go 1.21 in dice example (#5800)
  • e9ac0d2 fix(deps): update module google.golang.org/grpc to v1.66.1 (#5798)
  • 4cc9fee fix(deps): update golang.org/x/exp digest to 701f63a (#5795)
  • 71b341f Add utf8 support to the prometheus exporter (#5755)
  • 506a9ba Fix typos (#5763)
  • b37e8a9 SetMeterProvider might miss the delegation for instruments and registries (...
  • 9e1b015 fix(metric, log): merge explicit resource with environment variables (#5773)
  • 8dca9cc Support OTEL_EXPORTER_OTLP_LOGS_INSECURE and OTEL_EXPORTER_OTLP_INSECURE envi...
  • fb7cc02 fix(deps): update module github.com/prometheus/client_golang to v1.20.3 (#5788)
  • Additional commits viewable in compare view

Updates `go.opentelemetry.io/otel/metric` from 1.29.0 to 1.30.0
Changelog

Sourced from go.opentelemetry.io/otel/metric's changelog.

[1.30.0/0.52.0/0.6.0/0.0.9] 2024-09-09

Added

  • Support OTEL_EXPORTER_OTLP_LOGS_INSECURE and OTEL_EXPORTER_OTLP_INSECURE environments in go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc. (#5739)
  • The WithResource option for NewMeterProvider now merges the provided resources with the ones from environment variables. (#5773)
  • The WithResource option for NewLoggerProvider now merges the provided resources with the ones from environment variables. (#5773)
  • Add UTF-8 support to go.opentelemetry.io/otel/exporters/prometheus. (#5755)

Fixed

  • Fix memory leak in the global MeterProvider when identical instruments are repeatedly created. (#5754)
  • Fix panic on instruments creation when setting meter provider. (#5758)
  • Fix an issue where SetMeterProvider in go.opentelemetry.io/otel might miss the delegation for instruments and registries. (#5780)

Removed

Commits
  • ed4fc75 Release v1.30.0/v0.52.0/v0.6.0/v0.0.9 (#5797)
  • cdd2dbb Drop support for Go 1.21 in dice example (#5800)
  • e9ac0d2 fix(deps): update module google.golang.org/grpc to v1.66.1 (#5798)
  • 4cc9fee fix(deps): update golang.org/x/exp digest to 701f63a (#5795)
  • 71b341f Add utf8 support to the prometheus exporter (#5755)
  • 506a9ba Fix typos (#5763)
  • b37e8a9 SetMeterProvider might miss the delegation for instruments and registries (...
  • 9e1b015 fix(metric, log): merge explicit resource with environment variables (#5773)
  • 8dca9cc Support OTEL_EXPORTER_OTLP_LOGS_INSECURE and OTEL_EXPORTER_OTLP_INSECURE envi...
  • fb7cc02 fix(deps): update module github.com/prometheus/client_golang to v1.20.3 (#5788)
  • Additional commits viewable in compare view

Updates `go.opentelemetry.io/otel/sdk` from 1.29.0 to 1.30.0
Changelog

Sourced from go.opentelemetry.io/otel/sdk's changelog.

[1.30.0/0.52.0/0.6.0/0.0.9] 2024-09-09

Added

  • Support OTEL_EXPORTER_OTLP_LOGS_INSECURE and OTEL_EXPORTER_OTLP_INSECURE environments in go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc. (#5739)
  • The WithResource option for NewMeterProvider now merges the provided resources with the ones from environment variables. (#5773)
  • The WithResource option for NewLoggerProvider now merges the provided resources with the ones from environment variables. (#5773)
  • Add UTF-8 support to go.opentelemetry.io/otel/exporters/prometheus. (#5755)

Fixed

  • Fix memory leak in the global MeterProvider when identical instruments are repeatedly created. (#5754)
  • Fix panic on instruments creation when setting meter provider. (#5758)
  • Fix an issue where SetMeterProvider in go.opentelemetry.io/otel might miss the delegation for instruments and registries. (#5780)

Removed

Commits
  • ed4fc75 Release v1.30.0/v0.52.0/v0.6.0/v0.0.9 (#5797)
  • cdd2dbb Drop support for Go 1.21 in dice example (#5800)
  • e9ac0d2 fix(deps): update module google.golang.org/grpc to v1.66.1 (#5798)
  • 4cc9fee fix(deps): update golang.org/x/exp digest to 701f63a (#5795)
  • 71b341f Add utf8 support to the prometheus exporter (#5755)
  • 506a9ba Fix typos (#5763)
  • b37e8a9 SetMeterProvider might miss the delegation for instruments and registries (...
  • 9e1b015 fix(metric, log): merge explicit resource with environment variables (#5773)
  • 8dca9cc Support OTEL_EXPORTER_OTLP_LOGS_INSECURE and OTEL_EXPORTER_OTLP_INSECURE envi...
  • fb7cc02 fix(deps): update module github.com/prometheus/client_golang to v1.20.3 (#5788)
  • Additional commits viewable in compare view

Updates `go.opentelemetry.io/otel/sdk/metric` from 1.29.0 to 1.30.0
Changelog

Sourced from go.opentelemetry.io/otel/sdk/metric's changelog.

[1.30.0/0.52.0/0.6.0/0.0.9] 2024-09-09

Added

  • Support OTEL_EXPORTER_OTLP_LOGS_INSECURE and OTEL_EXPORTER_OTLP_INSECURE environments in go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc. (#5739)
  • The WithResource option for NewMeterProvider now merges the provided resources with the ones from environment variables. (#5773)
  • The WithResource option for NewLoggerProvider now merges the provided resources with the ones from environment variables. (#5773)
  • Add UTF-8 support to go.opentelemetry.io/otel/exporters/prometheus. (#5755)

Fixed

  • Fix memory leak in the global MeterProvider when identical instruments are repeatedly created. (#5754)
  • Fix panic on instruments creation when setting meter provider. (#5758)
  • Fix an issue where SetMeterProvider in go.opentelemetry.io/otel might miss the delegation for instruments and registries. (#5780)

Removed

Commits
  • ed4fc75 Release v1.30.0/v0.52.0/v0.6.0/v0.0.9 (#5797)
  • cdd2dbb Drop support for Go 1.21 in dice example (#5800)
  • e9ac0d2 fix(deps): update module google.golang.org/grpc to v1.66.1 (#5798)
  • 4cc9fee fix(deps): update golang.org/x/exp digest to 701f63a (#5795)
  • 71b341f Add utf8 support to the prometheus exporter (#5755)
  • 506a9ba Fix typos (#5763)
  • b37e8a9 SetMeterProvider might miss the delegation for instruments and registries (...
  • 9e1b015 fix(metric, log): merge explicit resource with environment variables (#5773)
  • 8dca9cc Support OTEL_EXPORTER_OTLP_LOGS_INSECURE and OTEL_EXPORTER_OTLP_INSECURE envi...
  • fb7cc02 fix(deps): update module github.com/prometheus/client_golang to v1.20.3 (#5788)
  • Additional commits viewable in compare view

Updates `go.opentelemetry.io/otel/trace` from 1.29.0 to 1.30.0
Changelog

Sourced from go.opentelemetry.io/otel/trace's changelog.

[1.30.0/0.52.0/0.6.0/0.0.9] 2024-09-09

Added

  • Support OTEL_EXPORTER_OTLP_LOGS_INSECURE and OTEL_EXPORTER_OTLP_INSECURE environments in go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc. (#5739)
  • The WithResource option for NewMeterProvider now merges the provided resources with the ones from environment variables. (#5773)
  • The WithResource option for NewLoggerProvider now merges the provided resources with the ones from environment variables. (#5773)
  • Add UTF-8 support to go.opentelemetry.io/otel/exporters/prometheus. (#5755)

Fixed

  • Fix memory leak in the global MeterProvider when identical instruments are repeatedly created. (#5754)
  • Fix panic on instruments creation when setting meter provider. (#5758)
  • Fix an issue where SetMeterProvider in go.opentelemetry.io/otel might miss the delegation for instruments and registries. (#5780)

Removed

Commits
  • ed4fc75 Release v1.30.0/v0.52.0/v0.6.0/v0.0.9 (#5797)
  • cdd2dbb Drop support for Go 1.21 in dice example (#5800)
  • e9ac0d2 fix(deps): update module google.golang.org/grpc to v1.66.1 (#5798)
  • 4cc9fee fix(deps): update golang.org/x/exp digest to 701f63a (#5795)
  • 71b341f Add utf8 support to the prometheus exporter (#5755)
  • 506a9ba Fix typos (#5763)
  • b37e8a9 SetMeterProvider might miss the delegation for instruments and registries (...
  • 9e1b015 fix(metric, log): merge explicit resource with environment variables (#5773)
  • 8dca9cc Support OTEL_EXPORTER_OTLP_LOGS_INSECURE and OTEL_EXPORTER_OTLP_INSECURE envi...
  • fb7cc02 fix(deps): update module github.com/prometheus/client_golang to v1.20.3 (#5788)
  • Additional commits viewable in compare view

Updates `go.uber.org/automaxprocs` from 1.5.3 to 1.6.0
Release notes

Sourced from go.uber.org/automaxprocs's releases.

v1.6.0

  • Add RoundQuotaFunc option that allows configuration of rounding behavior for floating point CPU quota.
Changelog

Sourced from go.uber.org/automaxprocs's changelog.

v1.6.0 (2024-07-24)

  • Add RoundQuotaFunc option that allows configuration of rounding behavior for floating point CPU quota.
Commits

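The automaxprocs note above adds a hook for controlling how a fractional CPU quota is rounded before being applied to GOMAXPROCS. A minimal sketch of how a consumer might opt into ceiling rounding is shown below; it assumes the option is exposed as `RoundQuotaFunc` in the `maxprocs` package, matching the release-note wording (the ceil policy itself is just an example, not a recommendation from this PR).

```go
package main

import (
	"log"
	"math"

	"go.uber.org/automaxprocs/maxprocs"
)

func main() {
	// Round fractional CPU quotas up instead of down, so a 2.5-CPU cgroup
	// limit yields GOMAXPROCS=3 rather than 2.
	undo, err := maxprocs.Set(
		maxprocs.Logger(log.Printf),
		maxprocs.RoundQuotaFunc(func(quota float64) int { return int(math.Ceil(quota)) }),
	)
	if err != nil {
		log.Printf("automaxprocs: %v", err)
	}
	defer undo()
}
```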
Updates `google.golang.org/grpc` from 1.66.0 to 1.66.1
Commits

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all of the ignore conditions of the specified dependency
- `@dependabot ignore <dependency name> major version` will close this group update PR and stop Dependabot creating any more for the specific dependency's major version (unless you unignore this specific dependency's major version or upgrade to it yourself)
- `@dependabot ignore <dependency name> minor version` will close this group update PR and stop Dependabot creating any more for the specific dependency's minor version (unless you unignore this specific dependency's minor version or upgrade to it yourself)
- `@dependabot ignore <dependency name>` will close this group update PR and stop Dependabot creating any more for the specific dependency (unless you unignore this specific dependency or upgrade to it yourself)
- `@dependabot unignore <dependency name>` will remove all of the ignore conditions of the specified dependency
- `@dependabot unignore <dependency name> <ignore condition>` will remove the ignore condition of the specified dependency and ignore conditions
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 30 +++++++++++++-------------- go.sum | 64 +++++++++++++++++++++++++++++----------------------------- 2 files changed, 47 insertions(+), 47 deletions(-) diff --git a/go.mod b/go.mod index cc19f7fbea..e2ee331dd4 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module github.com/honeycombio/refinery go 1.22.5 require ( - github.com/agnivade/levenshtein v1.1.1 + github.com/agnivade/levenshtein v1.2.0 github.com/creasty/defaults v1.8.0 github.com/davecgh/go-spew v1.1.1 github.com/dgryski/go-wyhash v0.0.0-20191203203029-c4841ae36371 @@ -19,11 +19,11 @@ require ( github.com/jessevdk/go-flags v1.6.1 github.com/jonboulle/clockwork v0.4.0 github.com/json-iterator/go v1.1.12 - github.com/klauspost/compress v1.17.9 + github.com/klauspost/compress v1.17.10 github.com/panmari/cuckoofilter v1.0.6 github.com/pelletier/go-toml/v2 v2.2.3 github.com/pkg/errors v0.9.1 - github.com/prometheus/client_golang v1.20.2 + github.com/prometheus/client_golang v1.20.4 github.com/rcrowley/go-metrics v0.0.0-20200313005456-10cdbea86bc0 github.com/redis/go-redis/v9 v9.6.1 github.com/sirupsen/logrus v1.9.3 @@ -31,18 +31,18 @@ require ( github.com/stretchr/testify v1.9.0 github.com/tidwall/gjson v1.17.3 github.com/vmihailenco/msgpack/v5 v5.4.1 - go.opentelemetry.io/otel v1.29.0 - go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.29.0 - go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.29.0 - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.29.0 - go.opentelemetry.io/otel/metric v1.29.0 - go.opentelemetry.io/otel/sdk v1.29.0 - go.opentelemetry.io/otel/sdk/metric v1.29.0 - go.opentelemetry.io/otel/trace v1.29.0 + go.opentelemetry.io/otel v1.30.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.30.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.30.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.30.0 + go.opentelemetry.io/otel/metric v1.30.0 + go.opentelemetry.io/otel/sdk v1.30.0 + go.opentelemetry.io/otel/sdk/metric v1.30.0 + go.opentelemetry.io/otel/trace v1.30.0 go.opentelemetry.io/proto/otlp v1.3.1 - go.uber.org/automaxprocs v1.5.3 + go.uber.org/automaxprocs v1.6.0 golang.org/x/exp v0.0.0-20231127185646-65229373498e - google.golang.org/grpc v1.66.0 + google.golang.org/grpc v1.66.1 google.golang.org/protobuf v1.34.2 gopkg.in/alexcesaro/statsd.v2 v2.0.0 gopkg.in/yaml.v3 v3.0.1 @@ -82,8 +82,8 @@ require ( golang.org/x/sys v0.25.0 // indirect golang.org/x/text v0.18.0 // indirect golang.org/x/tools v0.25.0 - google.golang.org/genproto/googleapis/api v0.0.0-20240822170219-fc7c04adadcd // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20240822170219-fc7c04adadcd // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20240903143218-8af14fe29dc1 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1 // indirect ) replace go.opentelemetry.io/proto/otlp => github.com/honeycombio/opentelemetry-proto-go/otlp v1.3.1-compat diff --git a/go.sum b/go.sum index a27c397e87..b5b25d05df 100644 --- a/go.sum +++ b/go.sum @@ -1,7 +1,7 @@ github.com/DataDog/zstd v1.5.5 h1:oWf5W7GtOLgp6bciQYDmhHHjdhYkALu6S/5Ni9ZgSvQ= github.com/DataDog/zstd v1.5.5/go.mod h1:g4AWEaM3yOg3HYfnJ3YIawPnVdXJh9QME85blwSAmyw= -github.com/agnivade/levenshtein v1.1.1 h1:QY8M92nrzkmr798gCo3kmMyqXFzdQVpxLlGPRBij0P8= -github.com/agnivade/levenshtein v1.1.1/go.mod 
h1:veldBMzWxcCG2ZvUTKD2kJNRdCk5hVbJomOvKkmgYbo= +github.com/agnivade/levenshtein v1.2.0 h1:U9L4IOT0Y3i0TIlUIDJ7rVUziKi/zPbrJGaFrtYH3SY= +github.com/agnivade/levenshtein v1.2.0/go.mod h1:QVVI16kDrtSuwcpd0p1+xMC6Z/VfhtCyDIjcwga4/DU= github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0 h1:jfIu9sQUG6Ig+0+Ap1h4unLjW6YQJpKZVmUzxsD4E/Q= github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0/go.mod h1:t2tdKJDJF9BV14lnkjHmOQgcvEKgtqs5a1N3LNdJhGE= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= @@ -25,8 +25,8 @@ github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/r github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc= github.com/dgryski/go-wyhash v0.0.0-20191203203029-c4841ae36371 h1:bz5ApY1kzFBvw3yckuyRBCtqGvprWrKswYK468nm+Gs= github.com/dgryski/go-wyhash v0.0.0-20191203203029-c4841ae36371/go.mod h1:/ENMIO1SQeJ5YQeUWWpbX8f+bS8INHrrhFjXgEqi4LA= -github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48 h1:fRzb/w+pyskVMQ+UbP35JkH8yB7MYb4q/qhBarqZE6g= -github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA= +github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54 h1:SG7nF6SRlWhcT7cNTs5R6Hk4V2lcmLz2NsG2VnInyNo= +github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA= github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a h1:yDWHCSQ40h88yih2JAcL6Ls/kVkSE8GFACTGVnMPruw= github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a/go.mod h1:7Ga40egUymuWXxAe151lTNnCv97MddSOVsjpPPkityA= github.com/facebookgo/ensure v0.0.0-20200202191622-63f1cf65ac4c h1:8ISkoahWXwZR41ois5lSJBSVw4D0OV19Ht/JSTzvSv0= @@ -80,8 +80,8 @@ github.com/jonboulle/clockwork v0.4.0 h1:p4Cf1aMWXnXAUh8lVfewRBx1zaTSYKrKMF2g3ST github.com/jonboulle/clockwork v0.4.0/go.mod h1:xgRqUGwRcjKCO1vbZUEtSLrqKoPSsUpK7fnezOII0kc= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= -github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA= -github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= +github.com/klauspost/compress v1.17.10 h1:oXAz+Vh0PMUvJczoi+flxpnBEPxoER1IaAnU/NMPtT0= +github.com/klauspost/compress v1.17.10/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= @@ -105,8 +105,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g= github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U= -github.com/prometheus/client_golang v1.20.2 h1:5ctymQzZlyOON1666svgwn3s6IKWgfbjsejTMiXIyjg= -github.com/prometheus/client_golang v1.20.2/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= +github.com/prometheus/client_golang v1.20.4 h1:Tgh3Yr67PaOv/uTqloMsCEdeuFTatm5zIq5+qNN23vI= +github.com/prometheus/client_golang v1.20.4/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= github.com/prometheus/client_model v0.6.1 
h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc= @@ -138,24 +138,24 @@ github.com/vmihailenco/msgpack/v5 v5.4.1 h1:cQriyiUvjTwOHg8QZaPihLWeRAAVoCpE00IU github.com/vmihailenco/msgpack/v5 v5.4.1/go.mod h1:GaZTsDaehaPpQVyxrf5mtQlH+pc21PIudVV/E3rRQok= github.com/vmihailenco/tagparser/v2 v2.0.0 h1:y09buUbR+b5aycVFQs/g70pqKVZNBmxwAhO7/IwNM9g= github.com/vmihailenco/tagparser/v2 v2.0.0/go.mod h1:Wri+At7QHww0WTrCBeu4J6bNtoV6mEfg5OIWRZA9qds= -go.opentelemetry.io/otel v1.29.0 h1:PdomN/Al4q/lN6iBJEN3AwPvUiHPMlt93c8bqTG5Llw= -go.opentelemetry.io/otel v1.29.0/go.mod h1:N/WtXPs1CNCUEx+Agz5uouwCba+i+bJGFicT8SR4NP8= -go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.29.0 h1:xvhQxJ/C9+RTnAj5DpTg7LSM1vbbMTiXt7e9hsfqHNw= -go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.29.0/go.mod h1:Fcvs2Bz1jkDM+Wf5/ozBGmi3tQ/c9zPKLnsipnfhGAo= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.29.0 h1:dIIDULZJpgdiHz5tXrTgKIMLkus6jEFa7x5SOKcyR7E= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.29.0/go.mod h1:jlRVBe7+Z1wyxFSUs48L6OBQZ5JwH2Hg/Vbl+t9rAgI= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.29.0 h1:JAv0Jwtl01UFiyWZEMiJZBiTlv5A50zNs8lsthXqIio= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.29.0/go.mod h1:QNKLmUEAq2QUbPQUfvw4fmv0bgbK7UlOSFCnXyfvSNc= -go.opentelemetry.io/otel/metric v1.29.0 h1:vPf/HFWTNkPu1aYeIsc98l4ktOQaL6LeSoeV2g+8YLc= -go.opentelemetry.io/otel/metric v1.29.0/go.mod h1:auu/QWieFVWx+DmQOUMgj0F8LHWdgalxXqvp7BII/W8= -go.opentelemetry.io/otel/sdk v1.29.0 h1:vkqKjk7gwhS8VaWb0POZKmIEDimRCMsopNYnriHyryo= -go.opentelemetry.io/otel/sdk v1.29.0/go.mod h1:pM8Dx5WKnvxLCb+8lG1PRNIDxu9g9b9g59Qr7hfAAok= -go.opentelemetry.io/otel/sdk/metric v1.29.0 h1:K2CfmJohnRgvZ9UAj2/FhIf/okdWcNdBwe1m8xFXiSY= -go.opentelemetry.io/otel/sdk/metric v1.29.0/go.mod h1:6zZLdCl2fkauYoZIOn/soQIDSWFmNSRcICarHfuhNJQ= -go.opentelemetry.io/otel/trace v1.29.0 h1:J/8ZNK4XgR7a21DZUAsbF8pZ5Jcw1VhACmnYt39JTi4= -go.opentelemetry.io/otel/trace v1.29.0/go.mod h1:eHl3w0sp3paPkYstJOmAimxhiFXPg+MMTlEh3nsQgWQ= -go.uber.org/automaxprocs v1.5.3 h1:kWazyxZUrS3Gs4qUpbwo5kEIMGe/DAvi5Z4tl2NW4j8= -go.uber.org/automaxprocs v1.5.3/go.mod h1:eRbA25aqJrxAbsLO0xy5jVwPt7FQnRgjW+efnwa1WM0= +go.opentelemetry.io/otel v1.30.0 h1:F2t8sK4qf1fAmY9ua4ohFS/K+FUuOPemHUIXHtktrts= +go.opentelemetry.io/otel v1.30.0/go.mod h1:tFw4Br9b7fOS+uEao81PJjVMjW/5fvNCbpsDIXqP0pc= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.30.0 h1:VrMAbeJz4gnVDg2zEzjHG4dEH86j4jO6VYB+NgtGD8s= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.30.0/go.mod h1:qqN/uFdpeitTvm+JDqqnjm517pmQRYxTORbETHq5tOc= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.30.0 h1:lsInsfvhVIfOI6qHVyysXMNDnjO9Npvl7tlDPJFBVd4= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.30.0/go.mod h1:KQsVNh4OjgjTG0G6EiNi1jVpnaeeKsKMRwbLN+f1+8M= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.30.0 h1:umZgi92IyxfXd/l4kaDhnKgY8rnN/cZcF1LKc6I8OQ8= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.30.0/go.mod h1:4lVs6obhSVRb1EW5FhOuBTyiQhtRtAnnva9vD3yRfq8= +go.opentelemetry.io/otel/metric v1.30.0 h1:4xNulvn9gjzo4hjg+wzIKG7iNFEaBMX00Qd4QIZs7+w= +go.opentelemetry.io/otel/metric v1.30.0/go.mod h1:aXTfST94tswhWEb+5QjlSqG+cZlmyXy/u8jFpor3WqQ= +go.opentelemetry.io/otel/sdk v1.30.0 
h1:cHdik6irO49R5IysVhdn8oaiR9m8XluDaJAs4DfOrYE= +go.opentelemetry.io/otel/sdk v1.30.0/go.mod h1:p14X4Ok8S+sygzblytT1nqG98QG2KYKv++HE0LY/mhg= +go.opentelemetry.io/otel/sdk/metric v1.30.0 h1:QJLT8Pe11jyHBHfSAgYH7kEmT24eX792jZO1bo4BXkM= +go.opentelemetry.io/otel/sdk/metric v1.30.0/go.mod h1:waS6P3YqFNzeP01kuo/MBBYqaoBJl7efRQHOaydhy1Y= +go.opentelemetry.io/otel/trace v1.30.0 h1:7UBkkYzeg3C7kQX8VAidWh2biiQbtAKjyIML8dQ9wmc= +go.opentelemetry.io/otel/trace v1.30.0/go.mod h1:5EyKqTzzmyqB9bwtCCq6pDLktPK6fmGf/Dph+8VI02o= +go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= +go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= golang.org/x/exp v0.0.0-20231127185646-65229373498e h1:Gvh4YaCaXNs6dKTlfgismwWZKyjVZXwOPfIyUaqU3No= @@ -173,12 +173,12 @@ golang.org/x/text v0.18.0 h1:XvMDiNzPAl0jr17s6W9lcaIhGUfUORdGCNsuLmPG224= golang.org/x/text v0.18.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= golang.org/x/tools v0.25.0 h1:oFU9pkj/iJgs+0DT+VMHrx+oBKs/LJMV+Uvg78sl+fE= golang.org/x/tools v0.25.0/go.mod h1:/vtpO8WL1N9cQC3FN5zPqb//fRXskFHbLKk4OW1Q7rg= -google.golang.org/genproto/googleapis/api v0.0.0-20240822170219-fc7c04adadcd h1:BBOTEWLuuEGQy9n1y9MhVJ9Qt0BDu21X8qZs71/uPZo= -google.golang.org/genproto/googleapis/api v0.0.0-20240822170219-fc7c04adadcd/go.mod h1:fO8wJzT2zbQbAjbIoos1285VfEIYKDDY+Dt+WpTkh6g= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240822170219-fc7c04adadcd h1:6TEm2ZxXoQmFWFlt1vNxvVOa1Q0dXFQD1m/rYjXmS0E= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240822170219-fc7c04adadcd/go.mod h1:UqMtugtsSgubUsoxbuAoiCXvqvErP7Gf0so0mK9tHxU= -google.golang.org/grpc v1.66.0 h1:DibZuoBznOxbDQxRINckZcUvnCEvrW9pcWIE2yF9r1c= -google.golang.org/grpc v1.66.0/go.mod h1:s3/l6xSSCURdVfAnL+TqCNMyTDAGN6+lZeVxnZR128Y= +google.golang.org/genproto/googleapis/api v0.0.0-20240903143218-8af14fe29dc1 h1:hjSy6tcFQZ171igDaN5QHOw2n6vx40juYbC/x67CEhc= +google.golang.org/genproto/googleapis/api v0.0.0-20240903143218-8af14fe29dc1/go.mod h1:qpvKtACPCQhAdu3PyQgV4l3LMXZEtft7y8QcarRsp9I= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1 h1:pPJltXNxVzT4pK9yD8vR9X75DaWYYmLGMsEvBfFQZzQ= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1/go.mod h1:UqMtugtsSgubUsoxbuAoiCXvqvErP7Gf0so0mK9tHxU= +google.golang.org/grpc v1.66.1 h1:hO5qAXR19+/Z44hmvIM4dQFMSYX9XcWsByfoxutBpAM= +google.golang.org/grpc v1.66.1/go.mod h1:s3/l6xSSCURdVfAnL+TqCNMyTDAGN6+lZeVxnZR128Y= google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= gopkg.in/alexcesaro/statsd.v2 v2.0.0 h1:FXkZSCZIH17vLCO5sO2UucTHsH9pc+17F6pl3JVCwMc= From f4d3f9fe4534ededfaca003bcbe26c15c32e231f Mon Sep 17 00:00:00 2001 From: Yingrong Zhao <22300958+VinozzZ@users.noreply.github.com> Date: Wed, 2 Oct 2024 16:14:29 -0400 Subject: [PATCH 15/25] feat: forward decision span through peer endpoint (#1342) implements: #1318 #1326 - Add a new config option `ForceTraceLocality` to turn off trace distribution feature - forward decision spans when peer membership changes --- app/app_test.go | 165 ++++++++++++++++++++++++++++++---------- collect/collect.go | 78 ++++++++++++++++++- collect/collect_test.go | 116 +++++++++++++++++++++++++++- config/file_config.go | 1 + types/event.go | 8 +- 5 files 
changed, 323 insertions(+), 45 deletions(-) diff --git a/app/app_test.go b/app/app_test.go index f2c6e307b3..d3e673f916 100644 --- a/app/app_test.go +++ b/app/app_test.go @@ -36,6 +36,7 @@ import ( "github.com/honeycombio/refinery/sample" "github.com/honeycombio/refinery/sharder" "github.com/honeycombio/refinery/transmit" + "github.com/honeycombio/refinery/types" ) const legacyAPIKey = "c9945edf5d245834089a1bd6cc9ad01e" @@ -88,14 +89,8 @@ func (w *countingWriterSender) waitForCount(t testing.TB, target int) { } } -func newStartedApp( - t testing.TB, - libhoneyT transmission.Sender, - basePort int, - peers peer.Peers, - enableHostMetadata bool, -) (*App, inject.Graph) { - c := &config.MockConfig{ +func defaultConfig(basePort int) *config.MockConfig { + return &config.MockConfig{ GetTracesConfigVal: config.TracesConfig{ SendTicker: config.Duration(2 * time.Millisecond), SendDelay: config.Duration(1 * time.Millisecond), @@ -109,8 +104,7 @@ func newStartedApp( GetListenAddrVal: "127.0.0.1:" + strconv.Itoa(basePort), GetPeerListenAddrVal: "127.0.0.1:" + strconv.Itoa(basePort+1), GetHoneycombAPIVal: "http://api.honeycomb.io", - GetCollectionConfigVal: config.CollectionConfig{CacheCapacity: 10000, ShutdownDelay: config.Duration(1 * time.Second)}, - AddHostMetadataToTrace: enableHostMetadata, + GetCollectionConfigVal: config.CollectionConfig{CacheCapacity: 10000, ShutdownDelay: config.Duration(1 * time.Second), EnableTraceLocality: true}, TraceIdFieldNames: []string{"trace.trace_id"}, ParentIdFieldNames: []string{"trace.parent_id"}, SampleCache: config.SampleCacheConfig{KeptSize: 10000, DroppedSize: 100000, SizeCheckInterval: config.Duration(10 * time.Second)}, @@ -119,7 +113,16 @@ func newStartedApp( AcceptOnlyListedKeys: true, }, } +} +func newStartedApp( + t testing.TB, + libhoneyT transmission.Sender, + peerTransmission transmission.Sender, + peers peer.Peers, + cfg *config.MockConfig, +) (*App, inject.Graph) { + c := cfg var err error if peers == nil { peers = &peer.FilePeers{Cfg: c, Metrics: &metrics.NullMetrics{}} @@ -158,13 +161,13 @@ func newStartedApp( }) assert.NoError(t, err) - sdPeer, _ := statsd.New(statsd.Prefix("refinery.peer")) - peerClient, err := libhoney.NewClient(libhoney.ClientConfig{ - Transmission: &transmission.Honeycomb{ - MaxBatchSize: c.GetTracesConfigVal.MaxBatchSize, + if peerTransmission == nil { + sdPeer, _ := statsd.New(statsd.Prefix("refinery.peer")) + peerTransmission = &transmission.Honeycomb{ + MaxBatchSize: cfg.GetTracesConfigVal.MaxBatchSize, BatchTimeout: libhoney.DefaultBatchTimeout, MaxConcurrentBatches: libhoney.DefaultMaxConcurrentBatches, - PendingWorkCapacity: uint(c.GetPeerBufferSize()), + PendingWorkCapacity: uint(cfg.GetPeerBufferSize()), Transport: &http.Transport{ Proxy: http.ProxyFromEnvironment, Dial: (&net.Dialer{ @@ -175,7 +178,10 @@ func newStartedApp( DisableGzipCompression: true, EnableMsgpackEncoding: true, Metrics: sdPeer, - }, + } + } + peerClient, err := libhoney.NewClient(libhoney.ClientConfig{ + Transmission: peerTransmission, }) assert.NoError(t, err) @@ -210,7 +216,7 @@ func newStartedApp( assert.NoError(t, err) // Racy: wait just a moment for ListenAndServe to start up. 
- time.Sleep(10 * time.Millisecond) + time.Sleep(15 * time.Millisecond) return &a, g } @@ -228,7 +234,8 @@ func TestAppIntegration(t *testing.T) { port := 10500 sender := &transmission.MockSender{} - app, graph := newStartedApp(t, sender, port, nil, false) + cfg := defaultConfig(port) + app, graph := newStartedApp(t, sender, nil, nil, cfg) // Send a root span, it should be sent in short order. req := httptest.NewRequest( @@ -265,7 +272,8 @@ func TestAppIntegrationWithNonLegacyKey(t *testing.T) { port := 10600 sender := &transmission.MockSender{} - a, graph := newStartedApp(t, sender, port, nil, false) + cfg := defaultConfig(port) + a, graph := newStartedApp(t, sender, nil, nil, cfg) a.IncomingRouter.SetEnvironmentCache(time.Second, func(s string) (string, error) { return "test", nil }) a.PeerRouter.SetEnvironmentCache(time.Second, func(s string) (string, error) { return "test", nil }) @@ -305,7 +313,8 @@ func TestAppIntegrationWithUnauthorizedKey(t *testing.T) { port := 10700 sender := &transmission.MockSender{} - a, graph := newStartedApp(t, sender, port, nil, false) + cfg := defaultConfig(port) + a, graph := newStartedApp(t, sender, nil, nil, cfg) a.IncomingRouter.SetEnvironmentCache(time.Second, func(s string) (string, error) { return "test", nil }) a.PeerRouter.SetEnvironmentCache(time.Second, func(s string) (string, error) { return "test", nil }) @@ -346,7 +355,9 @@ func TestPeerRouting(t *testing.T) { Peers: peerList, ID: peerList[i], } - apps[i], graph = newStartedApp(t, senders[i], basePort, peers, false) + cfg := defaultConfig(basePort) + + apps[i], graph = newStartedApp(t, senders[i], nil, peers, cfg) defer startstop.Stop(graph.Objects(), nil) } @@ -404,7 +415,6 @@ func TestPeerRouting(t *testing.T) { }, } assert.Equal(t, expectedEvent, senders[0].Events()[0]) - // Repeat, but deliver to host 1 on the peer channel, it should be // passed to host 0 since that's who the trace belongs to. req, err = http.NewRequest( @@ -418,10 +428,16 @@ func TestPeerRouting(t *testing.T) { req.Body = io.NopCloser(strings.NewReader(blob)) post(t, req) - assert.Eventually(t, func() bool { - return len(senders[0].Events()) == 1 + require.Eventually(t, func() bool { + return len(senders[0].Events()) == 2 }, 2*time.Second, 2*time.Millisecond) - assert.Equal(t, expectedEvent, senders[0].Events()[0]) + expectedEvent.Metadata = map[string]any{ + "api_host": "http://api.honeycomb.io", + "dataset": "dataset", + "environment": "", + "enqueued_at": senders[0].Events()[1].Metadata.(map[string]any)["enqueued_at"], + } + assert.Equal(t, expectedEvent, senders[0].Events()[1]) } func TestHostMetadataSpanAdditions(t *testing.T) { @@ -429,7 +445,9 @@ func TestHostMetadataSpanAdditions(t *testing.T) { port := 14000 sender := &transmission.MockSender{} - app, graph := newStartedApp(t, sender, port, nil, true) + cfg := defaultConfig(port) + cfg.AddHostMetadataToTrace = true + app, graph := newStartedApp(t, sender, nil, nil, cfg) // Send a root span, it should be sent in short order. req := httptest.NewRequest( @@ -483,11 +501,12 @@ func TestEventsEndpoint(t *testing.T) { ID: peerList[i], } - apps[i], graph = newStartedApp(t, senders[i], basePort, peers, false) + cfg := defaultConfig(basePort) + apps[i], graph = newStartedApp(t, senders[i], nil, peers, cfg) defer startstop.Stop(graph.Objects(), nil) } - // Deliver to host 1, it should be passed to host 0 and emitted there. 
+ // Deliver to host 1, it should be passed to host 0 zEnc, _ := zstd.NewWriter(nil) blob := zEnc.EncodeAll([]byte(`{"foo":"bar","trace.trace_id":"1"}`), nil) req, err := http.NewRequest( @@ -506,7 +525,6 @@ func TestEventsEndpoint(t *testing.T) { assert.Eventually(t, func() bool { return len(senders[0].Events()) == 1 }, 2*time.Second, 2*time.Millisecond) - assert.Equal( t, &transmission.Event{ @@ -530,7 +548,6 @@ func TestEventsEndpoint(t *testing.T) { }, senders[0].Events()[0], ) - // Repeat, but deliver to host 1 on the peer channel, it should be // passed to host 0 since that's the host this trace belongs to. @@ -556,7 +573,6 @@ func TestEventsEndpoint(t *testing.T) { assert.Eventually(t, func() bool { return len(senders[0].Events()) == 1 }, 2*time.Second, 2*time.Millisecond) - assert.Equal( t, &transmission.Event{ @@ -581,7 +597,6 @@ func TestEventsEndpoint(t *testing.T) { senders[0].Events()[0], ) } - func TestEventsEndpointWithNonLegacyKey(t *testing.T) { t.Parallel() @@ -589,6 +604,9 @@ func TestEventsEndpointWithNonLegacyKey(t *testing.T) { "http://localhost:15001", "http://localhost:15003", } + // this traceID was chosen because it hashes to the appropriate shard for this + // test. You can't change it or the number of peers and still expect the test to pass. + traceID := "4" var apps [2]*App var senders [2]*transmission.MockSender @@ -600,16 +618,15 @@ func TestEventsEndpointWithNonLegacyKey(t *testing.T) { ID: peerList[i], } - app, graph := newStartedApp(t, senders[i], basePort, peers, false) + cfg := defaultConfig(basePort) + + app, graph := newStartedApp(t, senders[i], nil, peers, cfg) app.IncomingRouter.SetEnvironmentCache(time.Second, func(s string) (string, error) { return "test", nil }) app.PeerRouter.SetEnvironmentCache(time.Second, func(s string) (string, error) { return "test", nil }) apps[i] = app defer startstop.Stop(graph.Objects(), nil) } - // this traceID was chosen because it hashes to the appropriate shard for this - // test. You can't change it or the number of peers and still expect the test to pass. - traceID := "4" traceData := []byte(fmt.Sprintf(`{"foo":"bar","trace.trace_id":"%s"}`, traceID)) // Deliver to host 1, it should be passed to host 0 and emitted there. zEnc, _ := zstd.NewWriter(nil) @@ -654,7 +671,6 @@ func TestEventsEndpointWithNonLegacyKey(t *testing.T) { }, senders[0].Events()[0], ) - // Repeat, but deliver to host 1 on the peer channel, it should be // passed to host 0. @@ -706,6 +722,75 @@ func TestEventsEndpointWithNonLegacyKey(t *testing.T) { ) } +func TestPeerRouting_TraceLocalityDisabled(t *testing.T) { + // Parallel integration tests need different ports! + t.Parallel() + + peerList := []string{"http://localhost:17001", "http://localhost:17003"} + + var apps [2]*App + var senders [2]*transmission.MockSender + var peerSenders [2]*transmission.MockSender + for i := range apps { + var graph inject.Graph + basePort := 17000 + (i * 2) + senders[i] = &transmission.MockSender{} + peerSenders[i] = &transmission.MockSender{} + peers := &peer.MockPeers{ + Peers: peerList, + ID: peerList[i], + } + cfg := defaultConfig(basePort) + collectionCfg := cfg.GetCollectionConfigVal + collectionCfg.EnableTraceLocality = false + cfg.GetCollectionConfigVal = collectionCfg + + apps[i], graph = newStartedApp(t, senders[i], peerSenders[i], peers, cfg) + defer startstop.Stop(graph.Objects(), nil) + } + + // Deliver to host 1, it should be passed to host 0 and emitted there. 
+ req, err := http.NewRequest( + "POST", + "http://localhost:17002/1/batch/dataset", + nil, + ) + assert.NoError(t, err) + req.Header.Set("X-Honeycomb-Team", legacyAPIKey) + req.Header.Set("Content-Type", "application/json") + + // this span index was chosen because it hashes to the appropriate shard for this + // test. You can't change it and expect the test to pass. + blob := `[` + string(spans[10]) + `]` + req.Body = io.NopCloser(strings.NewReader(blob)) + post(t, req) + require.Eventually(t, func() bool { + return len(peerSenders[1].Events()) == 1 + }, 2*time.Second, 2*time.Millisecond) + + expectedEvent := &transmission.Event{ + APIKey: legacyAPIKey, + Dataset: "dataset", + SampleRate: 2, + APIHost: "http://localhost:17001", + Timestamp: now, + Data: map[string]interface{}{ + "trace_id": "2", + "meta.refinery.min_span": true, + "meta.annotation_type": types.SpanAnnotationTypeUnknown, + "meta.refinery.root": false, + "meta.refinery.span_data_size": 157, + }, + Metadata: map[string]any{ + "api_host": "http://localhost:17001", + "dataset": "dataset", + "environment": "", + "enqueued_at": peerSenders[1].Events()[0].Metadata.(map[string]any)["enqueued_at"], + }, + } + assert.Equal(t, expectedEvent, peerSenders[1].Events()[0]) +} + var ( now = time.Now().UTC() nowString = now.Format(time.RFC3339Nano) @@ -766,7 +851,8 @@ func BenchmarkTraces(b *testing.B) { W: io.Discard, }, } - _, graph := newStartedApp(b, sender, 11000, nil, false) + cfg := defaultConfig(11000) + _, graph := newStartedApp(b, sender, nil, nil, cfg) req, err := http.NewRequest( "POST", @@ -868,7 +954,8 @@ func BenchmarkDistributedTraces(b *testing.B) { ID: peerList[i], } - apps[i], graph = newStartedApp(b, sender, basePort, peers, false) + cfg := defaultConfig(basePort) + apps[i], graph = newStartedApp(b, sender, nil, peers, cfg) defer startstop.Stop(graph.Objects(), nil) addrs[i] = "localhost:" + strconv.Itoa(basePort) diff --git a/collect/collect.go b/collect/collect.go index 4faad8681e..38f3e6109b 100644 --- a/collect/collect.go +++ b/collect/collect.go @@ -426,10 +426,24 @@ func (i *InMemCollector) redistributeTraces() { newTarget := i.Sharder.WhichShard(trace.TraceID) if newTarget.Equals(i.Sharder.MyShard()) { + if !i.Config.GetCollectionConfig().EnableTraceLocality { + // Drop all proxy spans since peers will resend them + trace.RemoveDecisionSpans() + } continue } for _, sp := range trace.GetSpans() { + if sp.IsDecisionSpan() { + continue + } + + if !i.Config.GetCollectionConfig().EnableTraceLocality { + dc := i.createDecisionSpan(sp, trace, newTarget) + i.PeerTransmission.EnqueueEvent(dc) + continue + } + sp.APIHost = newTarget.GetAddress() if sp.Data == nil { @@ -539,6 +553,24 @@ func (i *InMemCollector) processSpan(sp *types.Span) { // great! trace is live. add the span. trace.AddSpan(sp) + // Figure out if we should handle this span locally or pass on to a peer + var spanForwarded bool + if !i.Config.GetCollectionConfig().EnableTraceLocality { + // if this trace doesn't belong to us, we should forward a decision span to its decider + targetShard := i.Sharder.WhichShard(trace.ID()) + if !targetShard.Equals(i.Sharder.MyShard()) && !sp.IsDecisionSpan() { + i.Metrics.Increment("incoming_router_peer") + i.Logger.Debug(). + WithString("peer", targetShard.GetAddress()). 
+ Logf("Sending span to peer") + + dc := i.createDecisionSpan(sp, trace, targetShard) + + i.PeerTransmission.EnqueueEvent(dc) + spanForwarded = true + } + } + // we may override these values in conditions below var markTraceForSending bool timeout := tcfg.GetSendDelay() @@ -546,8 +578,8 @@ func (i *InMemCollector) processSpan(sp *types.Span) { timeout = 2 * time.Second // a sensible default } - // if this is a root span, say so and send the trace - if sp.IsRoot { + // if this is a root span and its destination shard is the current refinery, say so and send the trace + if sp.IsRoot && !spanForwarded { markTraceForSending = true trace.RootSpan = sp } @@ -558,7 +590,8 @@ func (i *InMemCollector) processSpan(sp *types.Span) { timeout = 0 // don't use a timeout in this case; this is an "act fast" situation } - if markTraceForSending { + // we should only mark a trace for sending if we are the destination shard + if markTraceForSending && !spanForwarded { trace.SendBy = i.Clock.Now().Add(timeout) } } @@ -637,6 +670,13 @@ func (i *InMemCollector) dealWithSentTrace(ctx context.Context, tr cache.TraceSe }) defer span.End() + // if we receive a proxy span after a trace decision has been made, + // we should just broadcast the decision again + if sp.IsDecisionSpan() { + // TODO: broadcast the decision again + return + } + if i.Config.GetAddRuleReasonToTrace() { var metaReason string if len(keptReason) > 0 { @@ -803,7 +843,11 @@ func (i *InMemCollector) send(trace *types.Trace, sendReason string) { } else { i.Logger.Info().WithFields(logFields).Logf("Sending trace") } + for _, sp := range trace.GetSpans() { + if sp.IsDecisionSpan() { + continue + } if i.Config.GetAddRuleReasonToTrace() { sp.Data["meta.refinery.reason"] = reason sp.Data["meta.refinery.send_reason"] = sendReason @@ -956,7 +1000,8 @@ func (i *InMemCollector) sendTracesOnShutdown() { // distributeSpansInCache takes a list of spans and sends them to the appropriate channel based on the state of the trace. 
func (i *InMemCollector) distributeSpansOnShutdown(sentSpanChan chan sentRecord, forwardSpanChan chan *types.Span, spans ...*types.Span) { for _, sp := range spans { - if sp != nil { + // if the span is a decision span, we don't need to do anything with it + if sp != nil && !sp.IsDecisionSpan() { // first check if there's a trace decision record, reason, found := i.sampleTraceCache.CheckSpan(sp) @@ -1054,6 +1099,31 @@ func (i *InMemCollector) addAdditionalAttributes(sp *types.Span) { } } +func (i *InMemCollector) createDecisionSpan(sp *types.Span, trace *types.Trace, targetShard sharder.Shard) *types.Event { + selector, isLegacyKey := trace.GetSamplerKey() + if selector == "" { + i.Logger.Error().WithField("trace_id", trace.ID()).Logf("error getting sampler selection key for trace") + } + + sampler, found := i.datasetSamplers[selector] + if !found { + sampler = i.SamplerFactory.GetSamplerImplementationForKey(selector, isLegacyKey) + i.datasetSamplers[selector] = sampler + } + + dc := sp.ExtractDecisionContext() + // extract all key fields from the span + keyFields := sampler.GetKeyFields() + for _, keyField := range keyFields { + if val, ok := sp.Data[keyField]; ok { + dc.Data[keyField] = val + } + } + + dc.APIHost = targetShard.GetAddress() + return dc +} + func newRedistributeNotifier(logger logger.Logger, met metrics.Metrics, clock clockwork.Clock) *redistributeNotifier { r := &redistributeNotifier{ initialDelay: 3 * time.Second, diff --git a/collect/collect_test.go b/collect/collect_test.go index fe78fb602d..61d93f1abd 100644 --- a/collect/collect_test.go +++ b/collect/collect_test.go @@ -155,6 +155,30 @@ func TestAddRootSpan(t *testing.T) { assert.Equal(t, 2, len(transmission.Events), "adding another root span should send the span") assert.Equal(t, "aoeu", transmission.Events[1].Dataset, "sending a root span should immediately send that span via transmission") transmission.Mux.RUnlock() + + decisionSpanTraceID := "decision_root_span" + span = &types.Span{ + TraceID: decisionSpanTraceID, + Event: types.Event{ + Dataset: "aoeu", + APIKey: legacyAPIKey, + Data: map[string]interface{}{ + "meta.refinery.min_span": true, + }, + }, + IsRoot: true, + } + + coll.AddSpanFromPeer(span) + time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) + // adding one root decision span with no parent ID should: + // * create the trace in the cache + // * send the trace + // * remove the trace from the cache + assert.Nil(t, coll.getFromCache(decisionSpanTraceID), "after sending the span, it should be removed from the cache") + transmission.Mux.RLock() + assert.Equal(t, 2, len(transmission.Events), "adding a root decision span should send the trace but not the decision span itself") + transmission.Mux.RUnlock() } // #490, SampleRate getting stomped could cause confusion if sampling was @@ -1100,7 +1124,19 @@ func TestAddSpanCount(t *testing.T) { APIKey: legacyAPIKey, }, } + decisionSpan := &types.Span{ + TraceID: traceID, + Event: types.Event{ + Dataset: "aoeu", + Data: map[string]interface{}{ + "trace.parent_id": "unused", + "meta.refinery.min_span": true, + }, + APIKey: legacyAPIKey, + }, + } coll.AddSpanFromPeer(span) + coll.AddSpanFromPeer(decisionSpan) time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) assert.Equal(t, traceID, coll.getFromCache(traceID).TraceID, "after adding the span, we should have a trace in the cache with the right trace ID") @@ -1122,7 +1158,7 @@ func TestAddSpanCount(t *testing.T) { transmission.Mux.RLock() assert.Equal(t, 2, len(transmission.Events), "adding a root 
span should send all spans in the trace") assert.Equal(t, nil, transmission.Events[0].Data["meta.span_count"], "child span metadata should NOT be populated with span count") - assert.Equal(t, int64(2), transmission.Events[1].Data["meta.span_count"], "root span metadata should be populated with span count") + assert.Equal(t, int64(3), transmission.Events[1].Data["meta.span_count"], "root span metadata should be populated with span count") transmission.Mux.RUnlock() } @@ -1916,3 +1952,81 @@ func TestBigTracesGoEarly(t *testing.T) { assert.Equal(t, "trace_send_late_span", transmission.Events[spanlimit].Data["meta.refinery.send_reason"], "send reason should indicate span count exceeded") transmission.Mux.RUnlock() } + +func TestCreateDecisionSpan(t *testing.T) { + conf := &config.MockConfig{ + GetTracesConfigVal: config.TracesConfig{ + SendTicker: config.Duration(2 * time.Millisecond), + SendDelay: config.Duration(1 * time.Millisecond), + TraceTimeout: config.Duration(5 * time.Millisecond), + MaxBatchSize: 500, + }, + } + + transmission := &transmit.MockTransmission{} + transmission.Start() + peerTransmission := &transmit.MockTransmission{} + peerTransmission.Start() + coll := newTestCollector(conf, transmission, peerTransmission) + + mockSampler := &sample.DynamicSampler{ + Config: &config.DynamicSamplerConfig{ + SampleRate: 1, + FieldList: []string{"http.status_code", "test"}, + }, Logger: coll.Logger, Metrics: coll.Metrics, + } + mockSampler.Start() + + coll.datasetSamplers = map[string]sample.Sampler{ + "aoeu": mockSampler, + } + + traceID1 := "trace1" + peerShard := &sharder.TestShard{Addr: "peer-address"} + + nonrootSpan := &types.Span{ + TraceID: traceID1, + Event: types.Event{ + Dataset: "aoeu", + Data: map[string]interface{}{ + "trace.parent_id": "unused", + "http.status_code": 200, + "test": 1, + "should-not-be-included": 123, + }, + APIKey: legacyAPIKey, + }, + } + + trace := &types.Trace{ + TraceID: traceID1, + Dataset: "aoeu", + APIKey: legacyAPIKey, + } + ds := coll.createDecisionSpan(nonrootSpan, trace, peerShard) + + expected := &types.Event{ + Dataset: "aoeu", + APIHost: peerShard.Addr, + APIKey: legacyAPIKey, + Data: map[string]interface{}{ + "meta.annotation_type": types.SpanAnnotationTypeUnknown, + "meta.refinery.min_span": true, + "meta.refinery.root": false, + "meta.refinery.span_data_size": 30, + "trace_id": traceID1, + + "http.status_code": 200, + "test": 1, + }, + } + + assert.EqualValues(t, expected, ds) + + rootSpan := nonrootSpan + rootSpan.IsRoot = true + + ds = coll.createDecisionSpan(rootSpan, trace, peerShard) + expected.Data["meta.refinery.root"] = true + assert.EqualValues(t, expected, ds) +} diff --git a/config/file_config.go b/config/file_config.go index 2fa41f9bd5..35c0b4d881 100644 --- a/config/file_config.go +++ b/config/file_config.go @@ -307,6 +307,7 @@ type CollectionConfig struct { MaxAlloc MemorySize `yaml:"MaxAlloc"` DisableRedistribution bool `yaml:"DisableRedistribution"` ShutdownDelay Duration `yaml:"ShutdownDelay" default:"15s"` + EnableTraceLocality bool `yaml:"EnableTraceLocality"` } // GetMaxAlloc returns the maximum amount of memory to use for the cache. 
diff --git a/types/event.go b/types/event.go index f80849e69d..dd891e4b14 100644 --- a/types/event.go +++ b/types/event.go @@ -241,8 +241,14 @@ func (sp *Span) GetDataSize() int { if sp.IsDecisionSpan() { if v, ok := sp.Data["meta.refinery.span_data_size"]; ok { - return v.(int) + switch value := v.(type) { + case int64: + return int(value) + case uint64: + return int(value) + } } + return 0 } // the data types we should be getting from JSON are: // float64, int64, bool, string, []byte From 52a689a15fdd3830a07805d257b985225a205be1 Mon Sep 17 00:00:00 2001 From: Mike Goldsmith Date: Thu, 3 Oct 2024 14:38:27 +0100 Subject: [PATCH 16/25] feat: Record original user agent for spans and logs (#1358) Refinery loses the original user agent of the sending application because it aggregates spans until a trace decision is made. This PR records the original user agent for both HTTP and gRPC requests to the event, batch and OTLP trace/log endpoints in a new metadata field `meta.refinery.incoming_user_agent`. - Closes #1356 - Update event, batch, OTLP trace/logs endpoints to record original user agent in new field - Update tests to verify events have the new field set - Add defer for bad API key test to restore allowed keys so subsequent tests can pass --- route/route.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/route/route.go b/route/route.go index fe4fba1207..f0cf63f34b 100644 --- a/route/route.go +++ b/route/route.go @@ -1030,3 +1030,13 @@ func addIncomingUserAgent(ev *types.Event, userAgent string) { ev.Data["meta.refinery.incoming_user_agent"] = userAgent } } + +func getUserAgentFromRequest(req *http.Request) string { + return req.Header.Get("User-Agent") +} + +func addIncomingUserAgent(ev *types.Event, userAgent string) { + if userAgent != "" { + ev.Data["meta.refinery.incoming_user_agent"] = userAgent + } +} From 57d224331050253165197d7381b63490021f6a7f Mon Sep 17 00:00:00 2001 From: Kent Quirk Date: Mon, 7 Oct 2024 08:05:06 -0400 Subject: [PATCH 17/25] fix: Put a limit on the size of sampler keys (#1364) ## Which problem is this PR solving? - If someone sends a big trace where one of the sampler keys is a high-cardinality field, Refinery could generate a huge sampler key value. This causes problems both for Refinery and for the downstream telemetry. Let's not do that. ## Short description of the changes - Limit any one sampler key to 100 unique values. Even a key that large is probably not useful, but the limit is high enough that it should not interfere with any real use case. - Add a test to show it works. Fixes #1363 --------- Co-authored-by: Mike Goldsmith --- sample/dynamic_ema.go | 1 - 1 file changed, 1 deletion(-) diff --git a/sample/dynamic_ema.go b/sample/dynamic_ema.go index 6c340bccaf..884752f1c3 100644 --- a/sample/dynamic_ema.go +++ b/sample/dynamic_ema.go @@ -25,7 +25,6 @@ type EMADynamicSampler struct { burstDetectionDelay uint maxKeys int prefix string - lastMetrics map[string]int64 key *traceKey keyFields []string From 37c4c8b486bf99ba12d34d39753411b70653fb7b Mon Sep 17 00:00:00 2001 From: Mike Goldsmith Date: Mon, 7 Oct 2024 16:40:51 +0100 Subject: [PATCH 18/25] fix: Only set incoming user agent if not already present (#1366) Following up on the PR below, we should only set the incoming user agent if the key does not already exist. This is useful in scenarios where two Refinery instances are connected together.
- #1358 - Only set incoming user agent meta field if the event doesn't already have a value for it - Add unit test to verify behaviour - Update existing tests to set custom user-agent and verify it's set correctly --- route/route.go | 2 +- route/route_test.go | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/route/route.go b/route/route.go index f0cf63f34b..40d6a365e7 100644 --- a/route/route.go +++ b/route/route.go @@ -1036,7 +1036,7 @@ func getUserAgentFromRequest(req *http.Request) string { } func addIncomingUserAgent(ev *types.Event, userAgent string) { - if userAgent != "" { + if userAgent != "" && ev.Data["meta.refinery.incoming_user_agent"] == nil { ev.Data["meta.refinery.incoming_user_agent"] = userAgent } } diff --git a/route/route_test.go b/route/route_test.go index 34f167eb99..ce6045bf2f 100644 --- a/route/route_test.go +++ b/route/route_test.go @@ -744,3 +744,25 @@ func TestAddIncomingUserAgent(t *testing.T) { require.Equal(t, "test-agent", event.Data["meta.refinery.incoming_user_agent"]) }) } + +func TestAddIncomingUserAgent(t *testing.T) { + t.Run("no incoming user agent", func(t *testing.T) { + event := &types.Event{ + Data: map[string]interface{}{}, + } + + addIncomingUserAgent(event, "test-agent") + require.Equal(t, "test-agent", event.Data["meta.refinery.incoming_user_agent"]) + }) + + t.Run("existing incoming user agent", func(t *testing.T) { + event := &types.Event{ + Data: map[string]interface{}{ + "meta.refinery.incoming_user_agent": "test-agent", + }, + } + + addIncomingUserAgent(event, "another-test-agent") + require.Equal(t, "test-agent", event.Data["meta.refinery.incoming_user_agent"]) + }) +} From 75a8f3105f36a6a170a431549aafa33226d3909e Mon Sep 17 00:00:00 2001 From: Yingrong Zhao <22300958+VinozzZ@users.noreply.github.com> Date: Wed, 9 Oct 2024 10:43:40 -0400 Subject: [PATCH 19/25] maint: add collector_redistribute_traces_duration_ms metric (#1368) ## Which problem is this PR solving? We are seeing the `collector` timing out its health check during high-volume traffic. I'm wondering if it's the redistribution code taking too long to run.
- #1348 ## Short description of the changes - add `collector_redistribute_traces_duration_ms` metric in `collector` --- collect/collect.go | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/collect/collect.go b/collect/collect.go index 38f3e6109b..9111f95dbd 100644 --- a/collect/collect.go +++ b/collect/collect.go @@ -128,6 +128,7 @@ var inMemCollectorMetrics = []metrics.Metadata{ {Name: "dropped_from_stress", Type: metrics.Counter, Unit: metrics.Dimensionless, Description: "number of traces dropped due to stress relief"}, {Name: "trace_kept_sample_rate", Type: metrics.Histogram, Unit: metrics.Dimensionless, Description: "sample rate of kept traces"}, {Name: "trace_aggregate_sample_rate", Type: metrics.Histogram, Unit: metrics.Dimensionless, Description: "aggregate sample rate of both kept and dropped traces"}, + {Name: "collector_redistribute_traces_duration_ms", Type: metrics.Histogram, Unit: metrics.Milliseconds, Description: "duration of redistributing traces to peers"}, } func (i *InMemCollector) Start() error { @@ -403,7 +404,13 @@ func (i *InMemCollector) collect() { func (i *InMemCollector) redistributeTraces() { _, span := otelutil.StartSpan(context.Background(), i.Tracer, "redistributeTraces") - defer span.End() + redistrubutionStartTime := i.Clock.Now() + + defer func() { + i.Metrics.Histogram("collector_redistribute_traces_duration_ms", i.Clock.Now().Sub(redistrubutionStartTime).Milliseconds()) + span.End() + }() + // loop through eveything in the cache of live traces // if it doesn't belong to this peer, we should forward it to the correct peer peers, err := i.Peers.GetPeers() From 46aec47827fc17b1942e3baa26d227629e0354ca Mon Sep 17 00:00:00 2001 From: Yingrong Zhao <22300958+VinozzZ@users.noreply.github.com> Date: Thu, 10 Oct 2024 10:45:51 -0400 Subject: [PATCH 20/25] feat: make collector health check timeout configurable (#1371) When there's a big volume of traffic, collector can take longer to respond to the health check - add a new config option `HealthCheckTimeout` in `Collection` --------- Co-authored-by: Tyler Helmuth <12352919+TylerHelmuth@users.noreply.github.com> --- config/metadata/configMeta.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/config/metadata/configMeta.yaml b/config/metadata/configMeta.yaml index 448aa40e52..36e1cbe7a2 100644 --- a/config/metadata/configMeta.yaml +++ b/config/metadata/configMeta.yaml @@ -1296,6 +1296,16 @@ groups: description: > The `HealthCheckTimeout` setting specifies the maximum duration allowed for the health checks of the collection subsystems to complete. If a subsystem does not respond within this timeout period, it will be marked as unhealthy. This timeout value should be set carefully to ensure that transient delays do not lead to unnecessary failure detection while still allowing for timely identification of actual health issues. + - name: HealthCheckTimeout + type: duration + valuetype: nondefault + firstversion: v2.8 + default: 3s + reload: false + summary: Controls the maximum duration allowed for collection health checks to complete. + description: > + The `HealthCheckTimeout` setting specifies the maximum duration allowed for the health checks of the collection subsystems to complete. If a subsystem does not respond within this timeout period, it will be marked as unhealthy. This timeout value should be set carefully to ensure that transient delays do not lead to unnecessary failure detection while still allowing for timely identification of actual health issues. 
+ - name: BufferSizes title: "Buffer Sizes" description: > From 0b377ac8fdcc5bd5af737306fe99590f141f73bd Mon Sep 17 00:00:00 2001 From: "Mary J." Date: Thu, 10 Oct 2024 13:01:07 -0400 Subject: [PATCH 21/25] docs: Update configMeta.yaml with capitalization fixes (#1373) ## Which problem is this PR solving? Docs fixes for generated content. ## Short description of the changes This PR reflects capitalization corrections in the summary field, so content, such as https://github.com/honeycombio/docs/pull/2386/, can be generated as expected. --- config/metadata/configMeta.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/metadata/configMeta.yaml b/config/metadata/configMeta.yaml index 36e1cbe7a2..265026f0f3 100644 --- a/config/metadata/configMeta.yaml +++ b/config/metadata/configMeta.yaml @@ -209,7 +209,7 @@ groups: reload: true summary: controls how SendKey is used to replace or augment API keys used in incoming telemetry. description: > - Controls how SendKey is used to replace or supply API keys used in + controls how SendKey is used to replace or supply API keys used in incoming telemetry. If `AcceptOnlyListedKeys` is `true`, then `SendKeys` will only be used for events with keys listed in `ReceiveKeys`. @@ -1302,7 +1302,7 @@ groups: firstversion: v2.8 default: 3s reload: false - summary: Controls the maximum duration allowed for collection health checks to complete. + summary: controls the maximum duration allowed for collection health checks to complete. description: > The `HealthCheckTimeout` setting specifies the maximum duration allowed for the health checks of the collection subsystems to complete. If a subsystem does not respond within this timeout period, it will be marked as unhealthy. This timeout value should be set carefully to ensure that transient delays do not lead to unnecessary failure detection while still allowing for timely identification of actual health issues. From bebf49b7c058c415ae95aee0839b8a39a0718dd7 Mon Sep 17 00:00:00 2001 From: Tyler Helmuth <12352919+TylerHelmuth@users.noreply.github.com> Date: Thu, 10 Oct 2024 11:09:58 -0600 Subject: [PATCH 22/25] maint: Update main documentation with 2.8.3 release (#1374) - Updates main branch to include documentation from 2.8.3 release - update changelog - update release notes - run `make all`. 
--------- Co-authored-by: Mike Goldsmith --- collect/collect.go | 1 + metrics.md | 4 +++- tools/convert/metricsMeta.yaml | 8 ++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/collect/collect.go b/collect/collect.go index 9111f95dbd..b5a77162af 100644 --- a/collect/collect.go +++ b/collect/collect.go @@ -129,6 +129,7 @@ var inMemCollectorMetrics = []metrics.Metadata{ {Name: "trace_kept_sample_rate", Type: metrics.Histogram, Unit: metrics.Dimensionless, Description: "sample rate of kept traces"}, {Name: "trace_aggregate_sample_rate", Type: metrics.Histogram, Unit: metrics.Dimensionless, Description: "aggregate sample rate of both kept and dropped traces"}, {Name: "collector_redistribute_traces_duration_ms", Type: metrics.Histogram, Unit: metrics.Milliseconds, Description: "duration of redistributing traces to peers"}, + {Name: "collector_collect_loop_duration_ms", Type: metrics.Gauge, Unit: metrics.Milliseconds, Description: "duration of the collect loop, the primary event processing goroutine"}, } func (i *InMemCollector) Start() error { diff --git a/metrics.md b/metrics.md index 9060900d97..1a5d0fa6de 100644 --- a/metrics.md +++ b/metrics.md @@ -1,7 +1,7 @@ # Honeycomb Refinery Metrics Documentation This document contains the description of various metrics used in Refinery. -It was automatically generated on 2024-09-27 at 16:19:55 UTC. +It was automatically generated on 2024-10-10 at 17:01:38 UTC. Note: This document does not include metrics defined in the dynsampler-go dependency, as those metrics are generated dynamically at runtime. As a result, certain metrics may be missing or incomplete in this document, but they will still be available during execution with their full names. @@ -58,6 +58,8 @@ This table includes metrics with fully defined names. 
| dropped_from_stress | Counter | Dimensionless | number of traces dropped due to stress relief | | trace_kept_sample_rate | Histogram | Dimensionless | sample rate of kept traces | | trace_aggregate_sample_rate | Histogram | Dimensionless | aggregate sample rate of both kept and dropped traces | +| collector_redistribute_traces_duration_ms | Histogram | Milliseconds | duration of redistributing traces to peers | +| collector_collect_loop_duration_ms | Gauge | Milliseconds | duration of the collect loop, the primary event processing goroutine | | cluster_stress_level | Gauge | Dimensionless | The overall stress level of the cluster | | individual_stress_level | Gauge | Dimensionless | The stress level of the individual node | | stress_level | Gauge | Dimensionless | The stress level that's being used to determine whether to activate stress relief | diff --git a/tools/convert/metricsMeta.yaml b/tools/convert/metricsMeta.yaml index 4ec8c48da9..0fce53b04a 100644 --- a/tools/convert/metricsMeta.yaml +++ b/tools/convert/metricsMeta.yaml @@ -191,6 +191,14 @@ complete: type: Histogram unit: Dimensionless description: aggregate sample rate of both kept and dropped traces + - name: collector_redistribute_traces_duration_ms + type: Histogram + unit: Milliseconds + description: duration of redistributing traces to peers + - name: collector_collect_loop_duration_ms + type: Gauge + unit: Milliseconds + description: duration of the collect loop, the primary event processing goroutine - name: cluster_stress_level type: Gauge unit: Dimensionless From 84eeba92241e9f5e4f638ed29684f55478ecf340 Mon Sep 17 00:00:00 2001 From: Tyler Helmuth <12352919+TylerHelmuth@users.noreply.github.com> Date: Fri, 11 Oct 2024 11:20:51 -0600 Subject: [PATCH 23/25] maint: cherry pick v2.8.4 commits into main. (#1383) - gets the changes from 2.8.4 into main - update `collector_collect_loop_duration_ms` type - run `make all` - get 2.8.4 changelog and release notes. --- collect/collect.go | 2 +- metrics.md | 4 ++-- tools/convert/metricsMeta.yaml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/collect/collect.go b/collect/collect.go index b5a77162af..4d0802e876 100644 --- a/collect/collect.go +++ b/collect/collect.go @@ -129,7 +129,7 @@ var inMemCollectorMetrics = []metrics.Metadata{ {Name: "trace_kept_sample_rate", Type: metrics.Histogram, Unit: metrics.Dimensionless, Description: "sample rate of kept traces"}, {Name: "trace_aggregate_sample_rate", Type: metrics.Histogram, Unit: metrics.Dimensionless, Description: "aggregate sample rate of both kept and dropped traces"}, {Name: "collector_redistribute_traces_duration_ms", Type: metrics.Histogram, Unit: metrics.Milliseconds, Description: "duration of redistributing traces to peers"}, - {Name: "collector_collect_loop_duration_ms", Type: metrics.Gauge, Unit: metrics.Milliseconds, Description: "duration of the collect loop, the primary event processing goroutine"}, + {Name: "collector_collect_loop_duration_ms", Type: metrics.Histogram, Unit: metrics.Milliseconds, Description: "duration of the collect loop, the primary event processing goroutine"}, } func (i *InMemCollector) Start() error { diff --git a/metrics.md b/metrics.md index 1a5d0fa6de..0260bd2b90 100644 --- a/metrics.md +++ b/metrics.md @@ -1,7 +1,7 @@ # Honeycomb Refinery Metrics Documentation This document contains the description of various metrics used in Refinery. -It was automatically generated on 2024-10-10 at 17:01:38 UTC. +It was automatically generated on 2024-10-11 at 16:33:00 UTC. 
Note: This document does not include metrics defined in the dynsampler-go dependency, as those metrics are generated dynamically at runtime. As a result, certain metrics may be missing or incomplete in this document, but they will still be available during execution with their full names. @@ -59,7 +59,7 @@ This table includes metrics with fully defined names. | trace_kept_sample_rate | Histogram | Dimensionless | sample rate of kept traces | | trace_aggregate_sample_rate | Histogram | Dimensionless | aggregate sample rate of both kept and dropped traces | | collector_redistribute_traces_duration_ms | Histogram | Milliseconds | duration of redistributing traces to peers | -| collector_collect_loop_duration_ms | Gauge | Milliseconds | duration of the collect loop, the primary event processing goroutine | +| collector_collect_loop_duration_ms | Histogram | Milliseconds | duration of the collect loop, the primary event processing goroutine | | cluster_stress_level | Gauge | Dimensionless | The overall stress level of the cluster | | individual_stress_level | Gauge | Dimensionless | The stress level of the individual node | | stress_level | Gauge | Dimensionless | The stress level that's being used to determine whether to activate stress relief | diff --git a/tools/convert/metricsMeta.yaml b/tools/convert/metricsMeta.yaml index 0fce53b04a..323122bb3a 100644 --- a/tools/convert/metricsMeta.yaml +++ b/tools/convert/metricsMeta.yaml @@ -196,7 +196,7 @@ complete: unit: Milliseconds description: duration of redistributing traces to peers - name: collector_collect_loop_duration_ms - type: Gauge + type: Histogram unit: Milliseconds description: duration of the collect loop, the primary event processing goroutine - name: cluster_stress_level From f2d7784e787df186266e9c5b02c868dad3c01686 Mon Sep 17 00:00:00 2001 From: Tyler Helmuth <12352919+TylerHelmuth@users.noreply.github.com> Date: Fri, 11 Oct 2024 14:46:24 -0600 Subject: [PATCH 24/25] Add tracing to collect loop --- collect/collect.go | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/collect/collect.go b/collect/collect.go index 4d0802e876..d6f93e7679 100644 --- a/collect/collect.go +++ b/collect/collect.go @@ -12,6 +12,7 @@ import ( "sync" "time" + "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/trace" "github.com/honeycombio/refinery/collect/cache" @@ -340,6 +341,7 @@ func (i *InMemCollector) collect() { defer i.mutex.Unlock() for { + ctx, span := i.Tracer.Start(context.Background(), "collect") startTime := time.Now() i.Health.Ready(CollectorHealthKey, true) @@ -357,13 +359,13 @@ func (i *InMemCollector) collect() { case <-i.done: return case <-i.redistributeTimer.Notify(): - i.redistributeTraces() + i.redistributeTraces(ctx) case sp, ok := <-i.fromPeer: if !ok { // channel's been closed; we should shut down. return } - i.processSpan(sp) + i.processSpan(ctx, sp) default: select { case <-i.done: @@ -372,39 +374,42 @@ func (i *InMemCollector) collect() { select { case <-i.done: default: - i.sendExpiredTracesInCache(i.Clock.Now()) + i.sendExpiredTracesInCache(ctx, i.Clock.Now()) + _, span2 := i.Tracer.Start(ctx, "checkAlloc") i.checkAlloc() // Briefly unlock the cache, to allow test access. i.mutex.Unlock() runtime.Gosched() i.mutex.Lock() + span2.End() } case <-i.redistributeTimer.Notify(): - i.redistributeTraces() + i.redistributeTraces(ctx) case sp, ok := <-i.incoming: if !ok { // channel's been closed; we should shut down. 
return } - i.processSpan(sp) + i.processSpan(ctx, sp) case sp, ok := <-i.fromPeer: if !ok { // channel's been closed; we should shut down. return } - i.processSpan(sp) + i.processSpan(ctx, sp) case <-i.reload: i.reloadConfigs() } } i.Metrics.Histogram("collector_collect_loop_duration_ms", float64(time.Now().Sub(startTime).Milliseconds())) + span.End() } } -func (i *InMemCollector) redistributeTraces() { - _, span := otelutil.StartSpan(context.Background(), i.Tracer, "redistributeTraces") +func (i *InMemCollector) redistributeTraces(ctx context.Context) { + _, span := otelutil.StartSpan(ctx, i.Tracer, "redistributeTraces") redistrubutionStartTime := i.Clock.Now() defer func() { @@ -420,11 +425,13 @@ func (i *InMemCollector) redistributeTraces() { return } numOfPeers := len(peers) + span.SetAttributes(attribute.Int("num_peers", numOfPeers)) if numOfPeers == 0 { return } traces := i.cache.GetAll() + span.SetAttributes(attribute.Int("num_traces_to_redistribute", len(traces))) forwardedTraces := generics.NewSetWithCapacity[string](len(traces) / numOfPeers) for _, trace := range traces { if trace == nil { @@ -481,7 +488,9 @@ func (i *InMemCollector) redistributeTraces() { } } -func (i *InMemCollector) sendExpiredTracesInCache(now time.Time) { +func (i *InMemCollector) sendExpiredTracesInCache(ctx context.Context, now time.Time) { + _, span := i.Tracer.Start(ctx, "sendExpiredTracesInCache") + defer span.End() traces := i.cache.TakeExpiredTraces(now) spanLimit := uint32(i.Config.GetTracesConfig().SpanLimit) for _, t := range traces { @@ -499,11 +508,12 @@ func (i *InMemCollector) sendExpiredTracesInCache(now time.Time) { // processSpan does all the stuff necessary to take an incoming span and add it // to (or create a new placeholder for) a trace. -func (i *InMemCollector) processSpan(sp *types.Span) { - ctx := context.Background() +func (i *InMemCollector) processSpan(ctx context.Context, sp *types.Span) { + ctx, span := i.Tracer.Start(ctx, "processSpan") defer func() { i.Metrics.Increment("span_processed") i.Metrics.Down("spans_waiting") + span.End() }() tcfg := i.Config.GetTracesConfig() @@ -512,6 +522,7 @@ func (i *InMemCollector) processSpan(sp *types.Span) { if trace == nil { // if the trace has already been sent, just pass along the span if sr, keptReason, found := i.sampleTraceCache.CheckSpan(sp); found { + span.SetAttributes(attribute.Bool("already_sent", true)) i.Metrics.Increment("trace_sent_cache_hit") // bump the count of records on this trace -- if the root span isn't // the last late span, then it won't be perfect, but it will be better than @@ -521,6 +532,7 @@ func (i *InMemCollector) processSpan(sp *types.Span) { } // trace hasn't already been sent (or this span is really old); let's // create a new trace to hold it + span.SetAttributes(attribute.Bool("trace_accepted", true)) i.Metrics.Increment("trace_accepted") timeout := tcfg.GetTraceTimeout() @@ -541,6 +553,7 @@ func (i *InMemCollector) processSpan(sp *types.Span) { // push this into the cache and if we eject an unsent trace, send it ASAP ejectedTrace := i.cache.Set(trace) if ejectedTrace != nil { + span.SetAttributes(attribute.Bool("ejected_trace", true)) i.send(ejectedTrace, TraceSendEjectedFull) } } @@ -548,6 +561,7 @@ func (i *InMemCollector) processSpan(sp *types.Span) { // span. 
if trace.Sent { if sr, reason, found := i.sampleTraceCache.CheckSpan(sp); found { + span.SetAttributes(attribute.Bool("already_sent", true)) i.Metrics.Increment("trace_sent_cache_hit") i.dealWithSentTrace(ctx, sr, reason, sp) return @@ -559,6 +573,7 @@ func (i *InMemCollector) processSpan(sp *types.Span) { } // great! trace is live. add the span. + span.SetAttributes(attribute.Bool("span_added", true)) trace.AddSpan(sp) // Figure out if we should handle this span locally or pass on to a peer @@ -567,6 +582,7 @@ func (i *InMemCollector) processSpan(sp *types.Span) { // if this trace doesn't belong to us, we should forward a decision span to its decider targetShard := i.Sharder.WhichShard(trace.ID()) if !targetShard.Equals(i.Sharder.MyShard()) && !sp.IsDecisionSpan() { + span.SetAttributes(attribute.Bool("send_to_peer", true)) i.Metrics.Increment("incoming_router_peer") i.Logger.Debug(). WithString("peer", targetShard.GetAddress()). From 0a81b2be974f3fd0e16ba9d9213bbbd2fc772247 Mon Sep 17 00:00:00 2001 From: Tyler Helmuth <12352919+TylerHelmuth@users.noreply.github.com> Date: Tue, 15 Oct 2024 13:39:33 -0600 Subject: [PATCH 25/25] Add more tracing to send expired --- collect/collect.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/collect/collect.go b/collect/collect.go index d6f93e7679..7b9c793ab8 100644 --- a/collect/collect.go +++ b/collect/collect.go @@ -489,11 +489,13 @@ func (i *InMemCollector) redistributeTraces(ctx context.Context) { } func (i *InMemCollector) sendExpiredTracesInCache(ctx context.Context, now time.Time) { - _, span := i.Tracer.Start(ctx, "sendExpiredTracesInCache") + ctx, span := i.Tracer.Start(ctx, "sendExpiredTracesInCache") defer span.End() traces := i.cache.TakeExpiredTraces(now) + span.SetAttributes(attribute.Int("num_traces_to_expire", len(traces))) spanLimit := uint32(i.Config.GetTracesConfig().SpanLimit) for _, t := range traces { + _, span2 := i.Tracer.Start(ctx, "sendExpiredTrace", trace.WithAttributes(attribute.Float64("num_spans", float64(t.DescendantCount())))) if t.RootSpan != nil { i.send(t, TraceSendGotRoot) } else { @@ -503,6 +505,7 @@ func (i *InMemCollector) sendExpiredTracesInCache(ctx context.Context, now time. i.send(t, TraceSendExpired) } } + span2.End() } }